Support autoholding nodes for specific changes/refs

Add support for "scoped" autohold requests, making it possible
to create requests that match a given ref filter and hold nodes
for builds that match the filter.

Introduce two new arguments to `zuul autohold`: --ref and --change,
with --ref accepting a regex that build refs are matched against, and
--change being a shortcut that creates the filter for a specific change.

Change-Id: I801ba1af4b1bda46abff07791dd96828f2738621
Krzysztof Klimonda 2017-10-25 12:16:47 +02:00
parent 1bb9171c43
commit 37d5403629
9 changed files with 260 additions and 16 deletions
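As a quick illustration of the new interface (not part of the diff below; the change number, reason and gearman port are placeholder values), a hold scoped to a single change can be requested either from the command line or through the RPC client:

# CLI sketch, using the new options added in this change:
#   zuul autohold --tenant tenant-one --project org/project --job project-test2 \
#       --change 12345 --reason "debugging" --count 1
# Passing both --change and --ref in one request is rejected.
import zuul.rpcclient

client = zuul.rpcclient.RPCClient('127.0.0.1', 4730)
client.autohold(tenant='tenant-one', project='org/project',
                job='project-test2', change='12345', ref='',
                reason='debugging', count=1)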

View File

@ -1629,6 +1629,10 @@ class FakeNodepool(object):
nodeid = path.split("/")[-1]
return nodeid
def removeNode(self, node):
path = self.NODE_ROOT + '/' + node["_oid"]
self.client.delete(path, recursive=True)
def addFailRequest(self, request):
self.fail_requests.add(request['_oid'])

View File

@ -1507,7 +1507,7 @@ class TestScheduler(ZuulTestCase):
self.gearman_server.port)
self.addCleanup(client.shutdown)
r = client.autohold('tenant-one', 'org/project', 'project-test2',
"reason text", 1)
"", "", "reason text", 1)
self.assertTrue(r)
# First check that successful jobs do not autohold
@ -1554,7 +1554,7 @@ class TestScheduler(ZuulTestCase):
held_node['hold_job'],
" ".join(['tenant-one',
'review.example.com/org/project',
'project-test2'])
'project-test2', '.*'])
)
self.assertEqual(held_node['comment'], "reason text")
@ -1574,13 +1574,151 @@ class TestScheduler(ZuulTestCase):
held_nodes += 1
self.assertEqual(held_nodes, 1)
def _test_autohold_scoped(self, change_obj, change, ref):
client = zuul.rpcclient.RPCClient('127.0.0.1',
self.gearman_server.port)
self.addCleanup(client.shutdown)
# create two changes on the same project, and an autohold request
# for one of them.
other = self.fake_gerrit.addFakeChange(
'org/project', 'master', 'other'
)
r = client.autohold('tenant-one', 'org/project', 'project-test2',
str(change), ref, "reason text", 1)
self.assertTrue(r)
# First, check that an unrelated job does not trigger autohold, even
# when it fails
self.executor_server.failJob('project-test2', other)
self.fake_gerrit.addEvent(other.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.assertEqual(other.data['status'], 'NEW')
self.assertEqual(other.reported, 1)
# project-test2
self.assertEqual(self.history[0].result, 'FAILURE')
# Check nodepool for a held node
held_node = None
for node in self.fake_nodepool.getNodes():
if node['state'] == zuul.model.STATE_HOLD:
held_node = node
break
self.assertIsNone(held_node)
# And then verify that a failed job for the requested change
# triggers the autohold
self.executor_server.failJob('project-test2', change_obj)
self.fake_gerrit.addEvent(change_obj.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.assertEqual(change_obj.data['status'], 'NEW')
self.assertEqual(change_obj.reported, 1)
# project-test2
self.assertEqual(self.history[1].result, 'FAILURE')
# Check nodepool for a held node
held_node = None
for node in self.fake_nodepool.getNodes():
if node['state'] == zuul.model.STATE_HOLD:
held_node = node
break
self.assertIsNotNone(held_node)
# Validate node has recorded the failed job
if change != "":
ref = "refs/changes/%s/%s/.*" % (
str(change_obj.number)[-1:], str(change_obj.number)
)
self.assertEqual(
held_node['hold_job'],
" ".join(['tenant-one',
'review.example.com/org/project',
'project-test2', ref])
)
self.assertEqual(held_node['comment'], "reason text")
@simple_layout('layouts/autohold.yaml')
def test_autohold_change(self):
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
self._test_autohold_scoped(A, change=A.number, ref="")
@simple_layout('layouts/autohold.yaml')
def test_autohold_ref(self):
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
ref = A.data['currentPatchSet']['ref']
self._test_autohold_scoped(A, change="", ref=ref)
@simple_layout('layouts/autohold.yaml')
def test_autohold_scoping(self):
client = zuul.rpcclient.RPCClient('127.0.0.1',
self.gearman_server.port)
self.addCleanup(client.shutdown)
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
# create three autohold requests, scoped to job, change and
# a specific ref
change = str(A.number)
ref = A.data['currentPatchSet']['ref']
r1 = client.autohold('tenant-one', 'org/project', 'project-test2',
"", "", "reason text", 1)
self.assertTrue(r1)
r2 = client.autohold('tenant-one', 'org/project', 'project-test2',
change, "", "reason text", 1)
self.assertTrue(r2)
r3 = client.autohold('tenant-one', 'org/project', 'project-test2',
"", ref, "reason text", 1)
self.assertTrue(r3)
# Fail 3 jobs for the same change, and verify that the autohold
# requests are fulfilled in the expected order: from the most
# specific towards the most generic one.
def _fail_job_and_verify_autohold_request(change_obj, ref_filter):
self.executor_server.failJob('project-test2', change_obj)
self.fake_gerrit.addEvent(change_obj.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
# Check nodepool for a held node
held_node = None
for node in self.fake_nodepool.getNodes():
if node['state'] == zuul.model.STATE_HOLD:
held_node = node
break
self.assertIsNotNone(held_node)
self.assertEqual(
held_node['hold_job'],
" ".join(['tenant-one',
'review.example.com/org/project',
'project-test2', ref_filter])
)
self.assertFalse(held_node['_lock'], "Node %s is locked" %
(node['_oid'],))
self.fake_nodepool.removeNode(held_node)
_fail_job_and_verify_autohold_request(A, ref)
ref = "refs/changes/%s/%s/.*" % (str(change)[-1:], str(change))
_fail_job_and_verify_autohold_request(A, ref)
_fail_job_and_verify_autohold_request(A, ".*")
@simple_layout('layouts/autohold.yaml')
def test_autohold_ignores_aborted_jobs(self):
client = zuul.rpcclient.RPCClient('127.0.0.1',
self.gearman_server.port)
self.addCleanup(client.shutdown)
r = client.autohold('tenant-one', 'org/project', 'project-test2',
"reason text", 1)
"", "", "reason text", 1)
self.assertTrue(r)
self.executor_server.hold_jobs_in_build = True
@ -1624,7 +1762,7 @@ class TestScheduler(ZuulTestCase):
self.addCleanup(client.shutdown)
r = client.autohold('tenant-one', 'org/project', 'project-test2',
"reason text", 1)
"", "", "reason text", 1)
self.assertTrue(r)
autohold_requests = client.autohold_list()
@ -1633,11 +1771,12 @@ class TestScheduler(ZuulTestCase):
# The single dict key should be a CSV string value
key = list(autohold_requests.keys())[0]
tenant, project, job = key.split(',')
tenant, project, job, ref_filter = key.split(',')
self.assertEqual('tenant-one', tenant)
self.assertIn('org/project', project)
self.assertEqual('project-test2', job)
self.assertEqual(".*", ref_filter)
# Note: the value is converted from set to list by json.
self.assertEqual([1, "reason text"], autohold_requests[key])
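To keep the two encodings used in these tests straight: nodepool records the matched request on the held node as a space-joined hold_job string, while autohold_list() returns a dict whose keys are the same elements joined with commas (JSON cannot represent tuple keys). A small sketch with the values used above:

key = ('tenant-one', 'review.example.com/org/project',
       'project-test2', '.*')
hold_job = ' '.join(key)    # value checked against held_node['hold_job']
csv_key = ','.join(key)     # shape of the keys returned by autohold_list()
tenant, project, job, ref_filter = csv_key.split(',')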

View File

@ -51,6 +51,11 @@ class Client(zuul.cmd.ZuulApp):
required=True)
cmd_autohold.add_argument('--job', help='job name',
required=True)
cmd_autohold.add_argument('--change',
help='specific change to hold nodes for',
required=False, default='')
cmd_autohold.add_argument('--ref', help='git ref to hold nodes for',
required=False, default='')
cmd_autohold.add_argument('--reason', help='reason for the hold',
required=True)
cmd_autohold.add_argument('--count',
@ -173,9 +178,15 @@ class Client(zuul.cmd.ZuulApp):
def autohold(self):
client = zuul.rpcclient.RPCClient(
self.server, self.port, self.ssl_key, self.ssl_cert, self.ssl_ca)
if self.args.change and self.args.ref:
print("Change and ref can't be both used for the same request")
return False
r = client.autohold(tenant=self.args.tenant,
project=self.args.project,
job=self.args.job,
change=self.args.change,
ref=self.args.ref,
reason=self.args.reason,
count=self.args.count)
return r
@ -190,14 +201,19 @@ class Client(zuul.cmd.ZuulApp):
return True
table = prettytable.PrettyTable(
field_names=['Tenant', 'Project', 'Job', 'Count', 'Reason'])
field_names=[
'Tenant', 'Project', 'Job', 'Ref Filter', 'Count', 'Reason'
])
for key, value in autohold_requests.items():
# The key comes to us as a CSV string because json doesn't like
# non-str keys.
tenant_name, project_name, job_name = key.split(',')
tenant_name, project_name, job_name, ref_filter = key.split(',')
count, reason = value
table.add_row([tenant_name, project_name, job_name, count, reason])
table.add_row([
tenant_name, project_name, job_name, ref_filter, count, reason
])
print(table)
return True

View File

@ -138,6 +138,10 @@ class GerritSource(BaseSource):
)
return [f]
def getRefForChange(self, change):
partial = change[-2:]
return "refs/changes/%s/%s/.*" % (partial, change)
approval = vs.Schema({'username': str,
'email': str,
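For illustration, the Gerrit mapping above expands a change number into a regex that covers every patchset of that change. A standalone sketch (the change number is arbitrary; note that a real Gerrit zero-pads the shard directory for changes below 10, e.g. refs/changes/05/5/1, which change[-2:] does not reproduce):

def get_ref_for_change(change):
    # Same logic as GerritSource.getRefForChange above.
    partial = change[-2:]
    return "refs/changes/%s/%s/.*" % (partial, change)

assert get_ref_for_change("12345") == "refs/changes/45/12345/.*"
# Matches refs/changes/45/12345/1, refs/changes/45/12345/2, and so on.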

View File

@ -68,3 +68,6 @@ class GitSource(BaseSource):
def getRejectFilters(self, config):
return []
def getRefForChange(self, change):
raise NotImplementedError()

View File

@ -144,6 +144,9 @@ class GithubSource(BaseSource):
)
return [f]
def getRefForChange(self, change):
return "refs/pull/%s/head" % change
review = v.Schema({'username': str,
'email': str,

View File

@ -48,10 +48,12 @@ class RPCClient(object):
self.log.debug("Job complete, success: %s" % (not job.failure))
return job
def autohold(self, tenant, project, job, reason, count):
def autohold(self, tenant, project, job, change, ref, reason, count):
data = {'tenant': tenant,
'project': project,
'job': job,
'change': change,
'ref': ref,
'reason': reason,
'count': count}
return not self.submitJob('zuul:autohold', data).failure

View File

@ -150,7 +150,20 @@ class RPCListener(object):
job.sendWorkException(error.encode('utf8'))
return
if args['change'] and args['ref']:
job.sendWorkException("Change and ref can't both be used "
"for the same request")
return
if args['change']:
# Convert change into ref based on zuul connection
ref_filter = project.source.getRefForChange(args['change'])
elif args['ref']:
ref_filter = "%s" % args['ref']
else:
ref_filter = ".*"
params['job_name'] = args['job']
params['ref_filter'] = ref_filter
params['reason'] = args['reason']
if args['count'] < 0:
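Condensing the branches above, the listener derives a single ref filter from the request arguments. A minimal sketch, where source stands for the project's configured source driver:

def resolve_ref_filter(change, ref, source):
    # Mirrors the handling in RPCListener above.
    if change and ref:
        raise ValueError("Change and ref can't both be used "
                         "for the same request")
    if change:
        # e.g. Gerrit "refs/changes/45/12345/.*", GitHub "refs/pull/42/head"
        return source.getRefForChange(change)
    if ref:
        return ref
    return ".*"  # job-wide hold, matches any ref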

View File

@ -19,6 +19,7 @@ import json
import logging
import os
import pickle
import re
import queue
import socket
import sys
@ -436,8 +437,9 @@ class Scheduler(threading.Thread):
self.last_reconfigured = int(time.time())
# TODOv3(jeblair): reconfigure time should be per-tenant
def autohold(self, tenant_name, project_name, job_name, reason, count):
key = (tenant_name, project_name, job_name)
def autohold(self, tenant_name, project_name, job_name, ref_filter,
reason, count):
key = (tenant_name, project_name, job_name, ref_filter)
if count == 0 and key in self.autohold_requests:
self.log.debug("Removing autohold for %s", key)
del self.autohold_requests[key]
@ -973,6 +975,66 @@ class Scheduler(threading.Thread):
self.log.exception("Exception estimating build time:")
pipeline.manager.onBuildStarted(event.build)
def _getAutoholdRequestKey(self, build):
change = build.build_set.item.change
autohold_key_base = (build.pipeline.layout.tenant.name,
change.project.canonical_name,
build.job.name)
class Scope(object):
"""Enum defining a precedence/priority of autohold requests.
Autohold requests for specific refs should be fulfilled first,
before those for changes, and generic jobs.
Matching algorithm goes over all existing autohold requests, and
returns one with the highest number (in case of duplicated
requests the last one wins).
"""
NONE = 0
JOB = 1
CHANGE = 2
REF = 3
def autohold_key_base_issubset(base, request_key):
"""check whether the given key is a subset of the build key"""
index = 0
base_len = len(base)
while index < base_len:
if base[index] != request_key[index]:
return False
index += 1
return True
# Do a partial match of the build's autohold key against all
# autohold requests, ignoring the last element of the request key
# (the ref filter), then do a regex match between the request's
# ref filter and the build's change ref. Finally, pick the most
# specific matching autohold request by comparing the "scopes"
# of the candidates.
autohold_key = None
scope = Scope.NONE
for request in self.autohold_requests:
ref_filter = request[-1]
if not autohold_key_base_issubset(autohold_key_base, request) \
or not re.match(ref_filter, change.ref):
continue
if ref_filter == ".*":
candidate_scope = Scope.JOB
elif ref_filter.endswith(".*"):
candidate_scope = Scope.CHANGE
else:
candidate_scope = Scope.REF
if candidate_scope > scope:
scope = candidate_scope
autohold_key = request
return autohold_key
def _doBuildCompletedEvent(self, event):
build = event.build
@ -981,11 +1043,9 @@ class Scheduler(threading.Thread):
# the nodes to nodepool.
try:
nodeset = build.nodeset
autohold_key = (build.pipeline.layout.tenant.name,
build.build_set.item.change.project.canonical_name,
build.job.name)
if (build.result == "FAILURE" and
autohold_key in self.autohold_requests):
autohold_key = self._getAutoholdRequestKey(build)
if (build.result == "FAILURE" and autohold_key):
# We explicitly only want to hold nodes for jobs if they have
# failed and have an autohold request.
try:
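To make the request-matching precedence concrete, here is a small standalone sketch of what _getAutoholdRequestKey does, using hypothetical pending requests and a failing build of patchset 3 of change 12345:

import re

requests = {
    ('tenant-one', 'review.example.com/org/project', 'project-test2',
     '.*'): (1, 'hold any failure of this job'),
    ('tenant-one', 'review.example.com/org/project', 'project-test2',
     'refs/changes/45/12345/.*'): (1, 'hold failures of change 12345'),
    ('tenant-one', 'review.example.com/org/project', 'project-test2',
     'refs/changes/45/12345/3'): (1, 'hold failures of patchset 3 only'),
}

key_base = ('tenant-one', 'review.example.com/org/project', 'project-test2')
change_ref = 'refs/changes/45/12345/3'

best_key, best_scope = None, 0
for request in requests:
    ref_filter = request[-1]
    if request[:3] != key_base or not re.match(ref_filter, change_ref):
        continue
    # Same ordering as the Scope enum: JOB (1) < CHANGE (2) < REF (3).
    scope = 1 if ref_filter == '.*' else (2 if ref_filter.endswith('.*') else 3)
    if scope > best_scope:
        best_key, best_scope = request, scope

# The most specific matching request wins, the patchset-scoped one:
assert best_key[-1] == 'refs/changes/45/12345/3'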