Add auto-hold feature

This adds a new table, and a series of commands to manipulate it,
with which an operator may indicate that nodes that have run failed
instances of specified jobs should automatically be held.

Change-Id: I69b00fbdeed4fba086a54f051bbb51384ea26a70
James E. Blair 2016-06-22 12:12:15 -07:00
parent 32992cd86d
commit 6da857c0ae
6 changed files with 207 additions and 0 deletions


@@ -147,6 +147,28 @@ alien-image-list
.. program-output:: nodepool alien-image-list --help
:nostderr:
In the case that a job is randomly failing for an unknown cause, it
may be necessary to instruct nodepool to automatically hold a node on
which that job has failed. To do so, use the ``job-create``
command to specify the job name and how many failed nodes should be
held. When debugging is complete, use ``job-delete`` to disable the
feature.
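For example, an operator debugging a hypothetical job named ``example-job``
might run the following (the job name, threshold, and id shown here are
purely illustrative)::

  nodepool job-create example-job --hold-on-failure 2
  nodepool job-list
  nodepool job-delete 1

where ``1`` is the id reported by ``job-list``.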
job-create
^^^^^^^^^^
.. program-output:: nodepool job-create --help
:nostderr:
job-list
^^^^^^^^
.. program-output:: nodepool job-list --help
:nostderr:
job-delete
^^^^^^^^^^
.. program-output:: nodepool job-delete --help
:nostderr:
Removing a Provider
===================


@@ -144,6 +144,23 @@ class NodePoolCmd(object):
help='Validate configuration file')
cmd_config_validate.set_defaults(func=self.config_validate)
cmd_job_list = subparsers.add_parser('job-list', help='list jobs')
cmd_job_list.set_defaults(func=self.job_list)
cmd_job_create = subparsers.add_parser('job-create', help='create job')
cmd_job_create.add_argument(
'name',
help='job name')
cmd_job_create.add_argument('--hold-on-failure',
help='number of nodes to hold when this job fails')
cmd_job_create.set_defaults(func=self.job_create)
cmd_job_delete = subparsers.add_parser(
'job-delete',
help='delete job')
cmd_job_delete.set_defaults(func=self.job_delete)
cmd_job_delete.add_argument('id', help='job id')
self.args = parser.parse_args()
def setup_logging(self):
@@ -374,6 +391,28 @@ class NodePoolCmd(object):
log.info("Configuation validation complete")
#TODO(asselin,yolanda): add validation of secure.conf
def job_list(self):
t = PrettyTable(["ID", "Name", "Hold on Failure"])
t.align = 'l'
with self.pool.getDB().getSession() as session:
for job in session.getJobs():
t.add_row([job.id, job.name, job.hold_on_failure])
print t
def job_create(self):
with self.pool.getDB().getSession() as session:
session.createJob(self.args.name,
hold_on_failure=self.args.hold_on_failure)
self.job_list()
def job_delete(self):
with self.pool.getDB().getSession() as session:
job = session.getJob(self.args.id)
if not job:
print "Job %s not found." % self.args.id
else:
job.delete()
def _wait_for_threads(self, threads):
for t in threads:
if t:


@@ -126,6 +126,15 @@ subnode_table = Table(
Column('state_time', Integer),
mysql_engine='InnoDB',
)
job_table = Table(
'job', metadata,
Column('id', Integer, primary_key=True),
# The name of the job
Column('name', String(255), index=True),
# Automatically hold up to this number of nodes that fail this job
Column('hold_on_failure', Integer),
mysql_engine='InnoDB',
)
class DibImage(object):
@@ -249,6 +258,20 @@ class SubNode(object):
session.commit()
class Job(object):
def __init__(self, name=None, hold_on_failure=0):
self.name = name
self.hold_on_failure = hold_on_failure
def delete(self):
session = Session.object_session(self)
session.delete(self)
session.commit()
mapper(Job, job_table)
mapper(SubNode, subnode_table,
properties=dict(_state=subnode_table.c.state))
@@ -460,3 +483,24 @@ class NodeDatabaseSession(object):
if not nodes:
return None
return nodes[0]
def getJob(self, id):
jobs = self.session().query(Job).filter_by(id=id).all()
if not jobs:
return None
return jobs[0]
def getJobByName(self, name):
jobs = self.session().query(Job).filter_by(name=name).all()
if not jobs:
return None
return jobs[0]
def getJobs(self):
return self.session().query(Job).all()
def createJob(self, *args, **kwargs):
new = Job(*args, **kwargs)
self.session().add(new)
self.commit()
return new
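Taken together, these session helpers form a small CRUD surface for jobs.
A minimal sketch of how they might be exercised (the job name and threshold
are illustrative, and ``pool`` is assumed to be an already configured
NodePool instance):

# Illustrative sketch only; assumes `pool` is a configured NodePool with a database.
with pool.getDB().getSession() as session:
    # Ask nodepool to hold up to two nodes that fail the hypothetical job.
    job = session.createJob('example-job', hold_on_failure=2)
    # The record can be retrieved again by id or by name.
    assert session.getJob(job.id).name == 'example-job'
    assert session.getJobByName('example-job').hold_on_failure == 2
    # Deleting the record disables automatic holding for that job.
    job.delete()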


@@ -107,6 +107,24 @@ class NodeCompleteThread(threading.Thread):
node.id)
return
nodepool_job = session.getJobByName(self.jobname)
if (nodepool_job and nodepool_job.hold_on_failure and
self.result != 'SUCCESS'):
held_nodes = session.getNodes(state=nodedb.HOLD)
held_nodes = [n for n in held_nodes if self.jobname in n.comment]
if len(held_nodes) >= nodepool_job.hold_on_failure:
self.log.info("Node id: %s has failed %s but %s nodes "
"are already held for that job" % (
node.id, self.jobname, len(held_nodes)))
else:
node.state = nodedb.HOLD
node.comment = "Automatically held after failing %s" % (
self.jobname,)
self.log.info("Node id: %s failed %s, automatically holding" % (
node.id, self.jobname))
self.nodepool.updateStats(session, node.provider_name)
return
target = self.nodepool.config.targets[node.target_name]
if self.jobname == target.jenkins_test_job:
self.log.debug("Test job for node id: %s complete, result: %s" %
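The auto-hold branch above reduces to a simple threshold check; the
standalone sketch below restates that logic (the function and variable
names are illustrative and are not part of the change):

def should_auto_hold(hold_on_failure, result, held_node_comments, jobname):
    # Only non-successful results of a job with a hold threshold are candidates.
    if not hold_on_failure or result == 'SUCCESS':
        return False
    # Nodes already held for this job are recognized by the comment set on hold.
    already_held = [c for c in held_node_comments if jobname in c]
    return len(already_held) < hold_on_failure

# Illustrative: with a threshold of 1, only the first failure holds a node.
comments = ["Automatically held after failing example-job"]
assert not should_auto_hold(1, 'FAILURE', comments, 'example-job')
assert should_auto_hold(2, 'FAILURE', comments, 'example-job')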


@@ -257,3 +257,20 @@ class TestNodepoolCMD(tests.DBTestCase):
self.patch_argv("-c", configfile, "image-build", "fake-dib-diskimage")
nodepoolcmd.main()
self.assert_listed(configfile, ['dib-image-list'], 4, 'ready', 1)
def test_job_create(self):
configfile = self.setup_config('node.yaml')
self.patch_argv("-c", configfile, "job-create", "fake-job",
"--hold-on-failure", "1")
nodepoolcmd.main()
self.assert_listed(configfile, ['job-list'], 2, 1, 1)
def test_job_delete(self):
configfile = self.setup_config('node.yaml')
self.patch_argv("-c", configfile, "job-create", "fake-job",
"--hold-on-failure", "1")
nodepoolcmd.main()
self.assert_listed(configfile, ['job-list'], 2, 1, 1)
self.patch_argv("-c", configfile, "job-delete", "1")
nodepoolcmd.main()
self.assert_listed(configfile, ['job-list'], 0, 1, 0)


@@ -625,6 +625,73 @@ class TestNodepool(tests.DBTestCase):
node = session.getNode(1)
self.assertEqual(node, None)
def _test_job_auto_hold(self, result):
configfile = self.setup_config('node.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
self.waitForImage(pool, 'fake-provider', 'fake-image')
self.waitForNodes(pool)
with pool.getDB().getSession() as session:
session.createJob('fake-job', hold_on_failure=1)
msg_obj = {'name': 'fake-job',
'build': {'node_name': 'fake-label-fake-provider-1',
'status': result}}
json_string = json.dumps(msg_obj)
# Don't delay when deleting.
self.useFixture(fixtures.MonkeyPatch(
'nodepool.nodepool.DELETE_DELAY',
0))
handler = nodepool.nodepool.NodeUpdateListener(pool,
'tcp://localhost:8881')
handler.handleEvent('onFinalized', json_string)
self.wait_for_threads()
return pool
def test_job_auto_hold_success(self):
"""Test that a successful job does not hold a node"""
pool = self._test_job_auto_hold('SUCCESS')
with pool.getDB().getSession() as session:
node = session.getNode(1)
self.assertIsNone(node)
def test_job_auto_hold_failure(self):
"""Test that a failed job automatically holds a node"""
pool = self._test_job_auto_hold('FAILURE')
with pool.getDB().getSession() as session:
node = session.getNode(1)
self.assertEqual(node.state, nodedb.HOLD)
def test_job_auto_hold_failure_max(self):
"""Test that a failed job automatically holds only one node"""
pool = self._test_job_auto_hold('FAILURE')
with pool.getDB().getSession() as session:
node = session.getNode(1)
self.assertEqual(node.state, nodedb.HOLD)
# Wait for a replacement node
self.waitForNodes(pool)
with pool.getDB().getSession() as session:
node = session.getNode(2)
self.assertEqual(node.state, nodedb.READY)
# Fail the job again
msg_obj = {'name': 'fake-job',
'build': {'node_name': 'fake-label-fake-provider-2',
'status': 'FAILURE'}}
json_string = json.dumps(msg_obj)
handler = nodepool.nodepool.NodeUpdateListener(pool,
'tcp://localhost:8881')
handler.handleEvent('onFinalized', json_string)
self.wait_for_threads()
# Ensure that the second node was deleted
with pool.getDB().getSession() as session:
node = session.getNode(2)
self.assertEqual(node, None)
class TestGearClient(tests.DBTestCase):
def test_wait_for_completion(self):