Merge "Fix canceling builds in starting phase"

This commit is contained in:
Zuul 2020-01-24 16:06:26 +00:00 committed by Gerrit Code Review
commit 83175b3fe0
4 changed files with 91 additions and 18 deletions

View File

@@ -2210,6 +2210,12 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob):
merger, items, repo_state)
if not commit: # merge conflict
self.recordResult('MERGER_FAILURE')
for _ in iterate_timeout(60, 'wait for merge'):
if not self.executor_server.hold_jobs_in_start:
break
time.sleep(1)
return commit
def recordResult(self, result):
@@ -2284,6 +2290,10 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob):
build.paused = False
super().resume()
def _send_aborted(self):
self.recordResult('ABORTED')
super()._send_aborted()
class RecordingMergeClient(zuul.merger.client.MergeClient):
@@ -2322,6 +2332,7 @@ class RecordingExecutorServer(zuul.executor.server.ExecutorServer):
self._ansible_manager_class = FakeAnsibleManager
super(RecordingExecutorServer, self).__init__(*args, **kw)
self.hold_jobs_in_build = False
self.hold_jobs_in_start = False
self.lock = threading.Lock()
self.running_builds = []
self.build_history = []

View File

@@ -2540,6 +2540,33 @@ class TestScheduler(ZuulTestCase):
self.assertEqual(A.reported, 0, "Abandoned change should not report")
self.assertEqual(B.reported, 1, "Change should report")
def test_cancel_starting_build(self):
"Test that a canceled build that is not processed yet is removed"
self.executor_server.hold_jobs_in_start = True
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
for _ in iterate_timeout(30, 'Wait for build to be in starting phase'):
if self.executor_server.job_workers:
break
# Abandon change to cancel build
self.fake_gerrit.addEvent(A.getChangeAbandonedEvent())
for _ in iterate_timeout(30, 'Wait for executor:stop request'):
stop_jobs = [x for x in self.gearman_server.jobs_history
if b'executor:stop' in x.name]
if stop_jobs:
break
self.executor_server.hold_jobs_in_start = False
self.waitUntilSettled()
self.assertHistory([
dict(name='project-merge', result='ABORTED')
])
def test_abandoned_not_timer(self):
"Test that an abandoned change does not cancel timer jobs"
# This test can not use simple_layout because it must start

View File

@@ -389,8 +389,7 @@ class ExecutorClient(object):
log.debug("Build has no associated gearman job")
return False
# TODOv3(jeblair): make a nicer way of recording build start.
if build.url is not None:
if build.__gearman_worker is not None:
log.debug("Build has already started")
self.cancelRunningBuild(build)
log.debug("Canceled running build")
@@ -406,12 +405,12 @@ class ExecutorClient(object):
time.sleep(1)
log.debug("Still unable to find build to cancel")
if build.url:
if build.__gearman_worker is not None:
log.debug("Build has just started")
self.cancelRunningBuild(build)
log.debug("Canceled running build")
return True
log.debug("Unable to cancel build")
log.error("Unable to cancel build")
def onBuildCompleted(self, job, result=None):
if job.unique in self.meta_jobs:
@@ -487,6 +486,7 @@ class ExecutorClient(object):
build.url = data.get('url', build.url)
# Update information about worker
build.worker.updateFromData(data)
build.__gearman_worker = build.worker.name
if 'paused' in data and build.paused != data['paused']:
build.paused = data['paused']
@@ -496,7 +496,6 @@
if not started:
self.log.info("Build %s started" % job)
build.__gearman_worker = data.get('worker_name')
self.sched.onBuildStarted(build)
else:
self.log.error("Unable to find build %s" % job.unique)

View File

@@ -829,6 +829,10 @@ class AnsibleJob(object):
def execute(self):
try:
self.time_starting_build = time.monotonic()
# report that job has been taken
self.job.sendWorkData(json.dumps(self._base_job_data()))
self.ssh_agent.start()
self.ssh_agent.add(self.private_key_file)
for key in self.arguments.get('ssh_keys', []):
@@ -862,6 +866,22 @@ class AnsibleJob(object):
except Exception:
self.log.exception("Error finalizing job thread:")
def _base_job_data(self):
return {
# TODO(mordred) worker_name is needed as a unique name for the
# client to use for cancelling jobs on an executor. It's
# defaulting to the hostname for now, but in the future we
# should allow setting a per-executor override so that one can
# run more than one executor on a host.
'worker_name': self.executor_server.hostname,
'worker_hostname': self.executor_server.hostname,
'worker_log_port': self.executor_server.log_streaming_port,
}
def _send_aborted(self):
result = dict(result='ABORTED')
self.job.sendWorkComplete(json.dumps(result))
def _execute(self):
args = self.arguments
self.log.info(
@@ -914,6 +934,11 @@ class AnsibleJob(object):
'branches': task.branches,
}
# Early abort if abort requested
if self.aborted:
self._send_aborted()
return
self.log.debug("Git updates complete")
merger = self.executor_server._getMerger(
self.jobdir.src_root,
@@ -939,10 +964,20 @@
# a work complete result, don't run any jobs
return
# Early abort if abort requested
if self.aborted:
self._send_aborted()
return
state_items = [i for i in args['items'] if not i.get('number')]
if state_items:
merger.setRepoState(state_items, repo_state)
# Early abort if abort requested
if self.aborted:
self._send_aborted()
return
for project in args['projects']:
repo = repos[project['canonical_name']]
# If this project is the Zuul project and this is a ref
@@ -979,30 +1014,31 @@
for repo in repos.values():
repo.setRemoteUrl('file:///dev/null')
# Early abort if abort requested
if self.aborted:
self._send_aborted()
return
# This prepares each playbook and the roles needed for each.
self.preparePlaybooks(args)
self.prepareAnsibleFiles(args)
self.writeLoggingConfig()
data = {
# TODO(mordred) worker_name is needed as a unique name for the
# client to use for cancelling jobs on an executor. It's defaulting
# to the hostname for now, but in the future we should allow
# setting a per-executor override so that one can run more than
# one executor on a host.
'worker_name': self.executor_server.hostname,
'worker_hostname': self.executor_server.hostname,
'worker_log_port': self.executor_server.log_streaming_port
}
# Early abort if abort requested
if self.aborted:
self._send_aborted()
return
data = self._base_job_data()
if self.executor_server.log_streaming_port != DEFAULT_FINGER_PORT:
data['url'] = "finger://{hostname}:{port}/{uuid}".format(
hostname=data['worker_hostname'],
port=data['worker_log_port'],
hostname=self.executor_server.hostname,
port=self.executor_server.log_streaming_port,
uuid=self.job.unique)
else:
data['url'] = 'finger://{hostname}/{uuid}'.format(
hostname=data['worker_hostname'],
hostname=self.executor_server.hostname,
uuid=self.job.unique)
self.job.sendWorkData(json.dumps(data))