zuul autohold: allow operator to specify nodes TTL
Add the option --node-hold-expiration to `zuul autohold`. This parameter allows an operator to specify how long a node set should remain in HOLD state after a build failure.

Change-Id: I25020d1722de97426e6699653ff72eba03c46b16
Depends-On: I9a09728e5728c537ee44721f5d5e774dc0dcefa7
This commit is contained in:
parent
f5ea4cf8a3
commit
e4bf201286
@ -1667,7 +1667,8 @@ class FakeNodepool(object):
|
||||
updated_time=now,
|
||||
image_id=None,
|
||||
host_keys=host_keys,
|
||||
executor='fake-nodepool')
|
||||
executor='fake-nodepool',
|
||||
hold_expiration=None)
|
||||
if self.remote_ansible:
|
||||
data['connection_type'] = 'ssh'
|
||||
if 'fakeuser' in node_type:
|
||||
|
@ -1755,6 +1755,38 @@ class TestScheduler(ZuulTestCase):
|
||||
break
|
||||
self.assertIsNone(held_node)
|
||||
|
||||
@simple_layout('layouts/autohold.yaml')
def test_autohold_hold_expiration(self):
    """A node held via autohold carries the requested hold expiration."""
    # Request an autohold for a single run of project-test2, asking for
    # a node hold expiration of 30.
    client = zuul.rpcclient.RPCClient('127.0.0.1',
                                      self.gearman_server.port)
    self.addCleanup(client.shutdown)
    accepted = client.autohold('tenant-one', 'org/project', 'project-test2',
                               "", "", "reason text", 1,
                               node_hold_expiration=30)
    self.assertTrue(accepted)

    # Hold a failed job
    change = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
    self.executor_server.failJob('project-test2', change)
    self.fake_gerrit.addEvent(change.getPatchsetCreatedEvent(1))

    self.waitUntilSettled()

    self.assertEqual(change.data['status'], 'NEW')
    self.assertEqual(change.reported, 1)
    # project-test2
    self.assertEqual(self.history[0].result, 'FAILURE')

    # Check nodepool for a held node: take the first node in HOLD state,
    # or None when there is no such node.
    held_node = next(
        (node for node in self.fake_nodepool.getNodes()
         if node['state'] == zuul.model.STATE_HOLD),
        None)
    self.assertIsNotNone(held_node)

    # Validate node has hold_expiration property
    self.assertEqual(int(held_node['hold_expiration']), 30)
|
||||
|
||||
@simple_layout('layouts/autohold.yaml')
|
||||
def test_autohold_list(self):
|
||||
client = zuul.rpcclient.RPCClient('127.0.0.1',
|
||||
@ -1779,7 +1811,7 @@ class TestScheduler(ZuulTestCase):
|
||||
self.assertEqual(".*", ref_filter)
|
||||
|
||||
# Note: the value is converted from set to list by json.
|
||||
self.assertEqual([1, "reason text"], autohold_requests[key])
|
||||
self.assertEqual([1, "reason text", None], autohold_requests[key])
|
||||
|
||||
@simple_layout('layouts/three-projects.yaml')
|
||||
def test_dependent_behind_dequeue(self):
|
||||
|
@ -61,6 +61,12 @@ class Client(zuul.cmd.ZuulApp):
|
||||
cmd_autohold.add_argument('--count',
|
||||
help='number of job runs (default: 1)',
|
||||
required=False, type=int, default=1)
|
||||
# Parse the value as an integer, like --count above; without type=int
# argparse would pass the raw command-line string on to the autohold
# request.  The default stays the integer 0 (argparse only applies
# ``type`` to string defaults).
cmd_autohold.add_argument('--node-hold-expiration',
                          help=('how long in seconds should the '
                                'node set be in HOLD status '
                                '(default: nodepool\'s max-hold-age '
                                'if set, or indefinitely)'),
                          required=False, type=int, default=0)
|
||||
cmd_autohold.set_defaults(func=self.autohold)
|
||||
|
||||
cmd_autohold_list = subparsers.add_parser(
|
||||
@ -182,13 +188,15 @@ class Client(zuul.cmd.ZuulApp):
|
||||
print("Change and ref can't be both used for the same request")
|
||||
return False
|
||||
|
||||
node_hold_expiration = self.args.node_hold_expiration
|
||||
r = client.autohold(tenant=self.args.tenant,
|
||||
project=self.args.project,
|
||||
job=self.args.job,
|
||||
change=self.args.change,
|
||||
ref=self.args.ref,
|
||||
reason=self.args.reason,
|
||||
count=self.args.count)
|
||||
count=self.args.count,
|
||||
node_hold_expiration=node_hold_expiration)
|
||||
return r
|
||||
|
||||
def autohold_list(self):
|
||||
@ -209,7 +217,7 @@ class Client(zuul.cmd.ZuulApp):
|
||||
# The key comes to us as a CSV string because json doesn't like
|
||||
# non-str keys.
|
||||
tenant_name, project_name, job_name, ref_filter = key.split(',')
|
||||
count, reason = value
|
||||
count, reason, node_hold_expiration = value
|
||||
|
||||
table.add_row([
|
||||
tenant_name, project_name, job_name, ref_filter, count, reason
|
||||
|
@ -390,6 +390,7 @@ class Node(object):
|
||||
self.provider = None
|
||||
self.region = None
|
||||
self.username = None
|
||||
self.hold_expiration = None
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
|
@ -88,7 +88,9 @@ class Nodepool(object):
|
||||
associated with the given NodeSet.
|
||||
'''
|
||||
self.log.info("Holding nodeset %s" % (nodeset,))
|
||||
(hold_iterations, reason) = self.sched.autohold_requests[autohold_key]
|
||||
(hold_iterations,
|
||||
reason,
|
||||
node_hold_expiration) = self.sched.autohold_requests[autohold_key]
|
||||
nodes = nodeset.getNodes()
|
||||
|
||||
for node in nodes:
|
||||
@ -97,6 +99,8 @@ class Nodepool(object):
|
||||
node.state = model.STATE_HOLD
|
||||
node.hold_job = " ".join(autohold_key)
|
||||
node.comment = reason
|
||||
if node_hold_expiration:
|
||||
node.hold_expiration = node_hold_expiration
|
||||
self.sched.zk.storeNode(node)
|
||||
|
||||
# We remove the autohold when the number of nodes in hold
|
||||
|
@ -48,14 +48,16 @@ class RPCClient(object):
|
||||
self.log.debug("Job complete, success: %s" % (not job.failure))
|
||||
return job
|
||||
|
||||
def autohold(self, tenant, project, job, change, ref, reason, count,
             node_hold_expiration=None):
    """Submit a ``zuul:autohold`` job and report whether it succeeded.

    Every argument is forwarded unchanged in the job payload under a key
    of the same name.  ``node_hold_expiration`` is optional and is sent
    as ``None`` when the caller omits it.

    Returns True when the submitted job did not fail, False otherwise.
    """
    data = {
        'tenant': tenant,
        'project': project,
        'job': job,
        'change': change,
        'ref': ref,
        'reason': reason,
        'count': count,
        'node_hold_expiration': node_hold_expiration,
    }
    return not self.submitJob('zuul:autohold', data).failure
|
||||
|
||||
def autohold_list(self):
|
||||
|
@ -172,6 +172,7 @@ class RPCListener(object):
|
||||
return
|
||||
|
||||
params['count'] = args['count']
|
||||
params['node_hold_expiration'] = args['node_hold_expiration']
|
||||
|
||||
self.sched.autohold(**params)
|
||||
job.sendWorkComplete()
|
||||
|
@ -439,14 +439,14 @@ class Scheduler(threading.Thread):
|
||||
# TODOv3(jeblair): reconfigure time should be per-tenant
|
||||
|
||||
def autohold(self, tenant_name, project_name, job_name, ref_filter,
             reason, count, node_hold_expiration=None):
    """Record or remove an autohold request.

    Requests are kept in ``self.autohold_requests`` keyed by the tuple
    ``(tenant_name, project_name, job_name, ref_filter)``.

    A ``count`` of 0 for a key that is already present removes that
    request.  In every other case the request is stored as
    ``(count, reason, node_hold_expiration)``, replacing any existing
    entry for the same key.

    ``node_hold_expiration`` defaults to None so that callers written
    against the earlier six-argument form keep working.
    """
    key = (tenant_name, project_name, job_name, ref_filter)
    if count == 0 and key in self.autohold_requests:
        self.log.debug("Removing autohold for %s", key)
        del self.autohold_requests[key]
    else:
        self.log.debug("Autohold requested for %s", key)
        self.autohold_requests[key] = (count, reason, node_hold_expiration)
|
||||
|
||||
def promote(self, tenant_name, pipeline_name, change_ids):
|
||||
event = PromoteEvent(tenant_name, pipeline_name, change_ids)
|
||||
|
Loading…
Reference in New Issue
Block a user