zuul autohold: allow operator to specify nodes TTL
Add the option --node-hold-expiration to `zuul autohold`. This parameter allows an operator to specify how long a node set should remain in HOLD state after a build failure.

Change-Id: I25020d1722de97426e6699653ff72eba03c46b16
Depends-On: I9a09728e5728c537ee44721f5d5e774dc0dcefa7
parent f5ea4cf8a3
commit e4bf201286
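For illustration, an operator request using the new flag might look like the following; the tenant, project, job, and reason values are placeholders, and the --tenant/--project/--job/--reason options are assumed to already exist alongside the --count option touched below:

    zuul autohold --tenant tenant-one --project org/project --job project-test2 \
        --reason "held for debugging" --count 1 --node-hold-expiration 3600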
@@ -1667,7 +1667,8 @@ class FakeNodepool(object):
                     updated_time=now,
                     image_id=None,
                     host_keys=host_keys,
-                    executor='fake-nodepool')
+                    executor='fake-nodepool',
+                    hold_expiration=None)
         if self.remote_ansible:
             data['connection_type'] = 'ssh'
         if 'fakeuser' in node_type:
@@ -1755,6 +1755,38 @@ class TestScheduler(ZuulTestCase):
                 break
         self.assertIsNone(held_node)
 
+    @simple_layout('layouts/autohold.yaml')
+    def test_autohold_hold_expiration(self):
+        client = zuul.rpcclient.RPCClient('127.0.0.1',
+                                          self.gearman_server.port)
+        self.addCleanup(client.shutdown)
+        r = client.autohold('tenant-one', 'org/project', 'project-test2',
+                            "", "", "reason text", 1, node_hold_expiration=30)
+        self.assertTrue(r)
+
+        # Hold a failed job
+        B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
+        self.executor_server.failJob('project-test2', B)
+        self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
+
+        self.waitUntilSettled()
+
+        self.assertEqual(B.data['status'], 'NEW')
+        self.assertEqual(B.reported, 1)
+        # project-test2
+        self.assertEqual(self.history[0].result, 'FAILURE')
+
+        # Check nodepool for a held node
+        held_node = None
+        for node in self.fake_nodepool.getNodes():
+            if node['state'] == zuul.model.STATE_HOLD:
+                held_node = node
+                break
+        self.assertIsNotNone(held_node)
+
+        # Validate node has hold_expiration property
+        self.assertEqual(int(held_node['hold_expiration']), 30)
+
     @simple_layout('layouts/autohold.yaml')
     def test_autohold_list(self):
         client = zuul.rpcclient.RPCClient('127.0.0.1',
@@ -1779,7 +1811,7 @@ class TestScheduler(ZuulTestCase):
         self.assertEqual(".*", ref_filter)
 
         # Note: the value is converted from set to list by json.
-        self.assertEqual([1, "reason text"], autohold_requests[key])
+        self.assertEqual([1, "reason text", None], autohold_requests[key])
 
     @simple_layout('layouts/three-projects.yaml')
     def test_dependent_behind_dequeue(self):
@@ -61,6 +61,12 @@ class Client(zuul.cmd.ZuulApp):
         cmd_autohold.add_argument('--count',
                                   help='number of job runs (default: 1)',
                                   required=False, type=int, default=1)
+        cmd_autohold.add_argument('--node-hold-expiration',
+                                  help=('how long in seconds should the '
+                                        'node set be in HOLD status '
+                                        '(default: nodepool\'s max-hold-age '
+                                        'if set, or indefinitely)'),
+                                  required=False, default=0)
         cmd_autohold.set_defaults(func=self.autohold)
 
         cmd_autohold_list = subparsers.add_parser(
@@ -182,13 +188,15 @@ class Client(zuul.cmd.ZuulApp):
             print("Change and ref can't be both used for the same request")
             return False
 
+        node_hold_expiration = self.args.node_hold_expiration
         r = client.autohold(tenant=self.args.tenant,
                             project=self.args.project,
                             job=self.args.job,
                             change=self.args.change,
                             ref=self.args.ref,
                             reason=self.args.reason,
-                            count=self.args.count)
+                            count=self.args.count,
+                            node_hold_expiration=node_hold_expiration)
         return r
 
     def autohold_list(self):
@@ -209,7 +217,7 @@ class Client(zuul.cmd.ZuulApp):
             # The key comes to us as a CSV string because json doesn't like
             # non-str keys.
             tenant_name, project_name, job_name, ref_filter = key.split(',')
-            count, reason = value
+            count, reason, node_hold_expiration = value
 
             table.add_row([
                 tenant_name, project_name, job_name, ref_filter, count, reason
@@ -390,6 +390,7 @@ class Node(object):
         self.provider = None
         self.region = None
         self.username = None
+        self.hold_expiration = None
 
     @property
     def state(self):
@@ -88,7 +88,9 @@ class Nodepool(object):
        associated with the given NodeSet.
        '''
        self.log.info("Holding nodeset %s" % (nodeset,))
-        (hold_iterations, reason) = self.sched.autohold_requests[autohold_key]
+        (hold_iterations,
+         reason,
+         node_hold_expiration) = self.sched.autohold_requests[autohold_key]
        nodes = nodeset.getNodes()
 
        for node in nodes:
@@ -97,6 +99,8 @@ class Nodepool(object):
            node.state = model.STATE_HOLD
            node.hold_job = " ".join(autohold_key)
            node.comment = reason
+            if node_hold_expiration:
+                node.hold_expiration = node_hold_expiration
            self.sched.zk.storeNode(node)
 
        # We remove the autohold when the number of nodes in hold
@@ -48,14 +48,16 @@ class RPCClient(object):
        self.log.debug("Job complete, success: %s" % (not job.failure))
        return job
 
-    def autohold(self, tenant, project, job, change, ref, reason, count):
+    def autohold(self, tenant, project, job, change, ref, reason, count,
+                 node_hold_expiration=None):
        data = {'tenant': tenant,
                'project': project,
                'job': job,
                'change': change,
                'ref': ref,
                'reason': reason,
-                'count': count}
+                'count': count,
+                'node_hold_expiration': node_hold_expiration}
        return not self.submitJob('zuul:autohold', data).failure
 
    def autohold_list(self):
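A minimal sketch of calling the extended RPC API directly, mirroring the new test above; the Gearman address, port, and argument values are placeholders, and leaving node_hold_expiration unset defers to nodepool's max-hold-age per the new option's help text:

    import zuul.rpcclient

    # Connect to the scheduler's Gearman server (address and port are placeholders).
    client = zuul.rpcclient.RPCClient('127.0.0.1', 4730)

    # Ask that the node set from a failed run of project-test2 be held for one hour.
    client.autohold('tenant-one', 'org/project', 'project-test2',
                    "", "", "held for debugging", 1,
                    node_hold_expiration=3600)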
@@ -172,6 +172,7 @@ class RPCListener(object):
            return
 
        params['count'] = args['count']
+        params['node_hold_expiration'] = args['node_hold_expiration']
 
        self.sched.autohold(**params)
        job.sendWorkComplete()
@@ -439,14 +439,14 @@ class Scheduler(threading.Thread):
        # TODOv3(jeblair): reconfigure time should be per-tenant
 
    def autohold(self, tenant_name, project_name, job_name, ref_filter,
-                 reason, count):
+                 reason, count, node_hold_expiration):
        key = (tenant_name, project_name, job_name, ref_filter)
        if count == 0 and key in self.autohold_requests:
            self.log.debug("Removing autohold for %s", key)
            del self.autohold_requests[key]
        else:
            self.log.debug("Autohold requested for %s", key)
-            self.autohold_requests[key] = (count, reason)
+            self.autohold_requests[key] = (count, reason, node_hold_expiration)
 
    def promote(self, tenant_name, pipeline_name, change_ids):
        event = PromoteEvent(tenant_name, pipeline_name, change_ids)