Support autoholding nodes for specific changes/refs

Add support for "scoped" autohold requests, making it possible to create requests that match a given ref filter and hold nodes for build that matches the filter. Introduce two new arguments to `zuul autohold`: --ref and --change, with --ref accepting regex that builds are matched against, and --change being a shortcut that creates a filter for the specific change. Change-Id: I801ba1af4b1bda46abff07791dd96828f2738621
2017-10-25 12:16:47 +02:00 · 2017-10-25 12:16:47 +02:00 · 37d5403629
parent 1bb9171c43
commit 37d5403629
9 changed files with 260 additions and 16 deletions
--- a/tests/base.py
+++ b/tests/base.py
@ -1629,6 +1629,10 @@ class FakeNodepool(object):
        nodeid = path.split("/")[-1]
        return nodeid
    def removeNode(self, node):
        path = self.NODE_ROOT + '/' + node["_oid"]
        self.client.delete(path, recursive=True)
    def addFailRequest(self, request):
        self.fail_requests.add(request['_oid'])
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@ -1507,7 +1507,7 @@ class TestScheduler(ZuulTestCase):
                                          self.gearman_server.port)
        self.addCleanup(client.shutdown)
        r = client.autohold('tenant-one', 'org/project', 'project-test2',
-                            "reason text", 1)
+                            "", "", "reason text", 1)
        self.assertTrue(r)
        # First check that successful jobs do not autohold
@ -1554,7 +1554,7 @@ class TestScheduler(ZuulTestCase):
            held_node['hold_job'],
            " ".join(['tenant-one',
                      'review.example.com/org/project',
-                      'project-test2'])
+                      'project-test2', '.*'])
        )
        self.assertEqual(held_node['comment'], "reason text")
@ -1574,13 +1574,151 @@ class TestScheduler(ZuulTestCase):
                held_nodes += 1
        self.assertEqual(held_nodes, 1)
    def _test_autohold_scoped(self, change_obj, change, ref):
        client = zuul.rpcclient.RPCClient('127.0.0.1',
                                          self.gearman_server.port)
        self.addCleanup(client.shutdown)
        # create two changes on the same project, and autohold request
        # for one of them.
        other = self.fake_gerrit.addFakeChange(
            'org/project', 'master', 'other'
        )
        r = client.autohold('tenant-one', 'org/project', 'project-test2',
                            str(change), ref, "reason text", 1)
        self.assertTrue(r)
        # First, check that an unrelated job does not trigger autohold, even
        # when it failed
        self.executor_server.failJob('project-test2', other)
        self.fake_gerrit.addEvent(other.getPatchsetCreatedEvent(1))
        self.waitUntilSettled()
        self.assertEqual(other.data['status'], 'NEW')
        self.assertEqual(other.reported, 1)
        # project-test2
        self.assertEqual(self.history[0].result, 'FAILURE')
        # Check nodepool for a held node
        held_node = None
        for node in self.fake_nodepool.getNodes():
            if node['state'] == zuul.model.STATE_HOLD:
                held_node = node
                break
        self.assertIsNone(held_node)
        # And then verify that failed job for the defined change
        # triggers the autohold
        self.executor_server.failJob('project-test2', change_obj)
        self.fake_gerrit.addEvent(change_obj.getPatchsetCreatedEvent(1))
        self.waitUntilSettled()
        self.assertEqual(change_obj.data['status'], 'NEW')
        self.assertEqual(change_obj.reported, 1)
        # project-test2
        self.assertEqual(self.history[1].result, 'FAILURE')
        # Check nodepool for a held node
        held_node = None
        for node in self.fake_nodepool.getNodes():
            if node['state'] == zuul.model.STATE_HOLD:
                held_node = node
                break
        self.assertIsNotNone(held_node)
        # Validate node has recorded the failed job
        if change != "":
            ref = "refs/changes/%s/%s/.*" % (
                str(change_obj.number)[-1:], str(change_obj.number)
            )
        self.assertEqual(
            held_node['hold_job'],
            " ".join(['tenant-one',
                      'review.example.com/org/project',
                      'project-test2', ref])
        )
        self.assertEqual(held_node['comment'], "reason text")
    @simple_layout('layouts/autohold.yaml')
    def test_autohold_change(self):
        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        self._test_autohold_scoped(A, change=A.number, ref="")
    @simple_layout('layouts/autohold.yaml')
    def test_autohold_ref(self):
        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        ref = A.data['currentPatchSet']['ref']
        self._test_autohold_scoped(A, change="", ref=ref)
    @simple_layout('layouts/autohold.yaml')
    def test_autohold_scoping(self):
        client = zuul.rpcclient.RPCClient('127.0.0.1',
                                          self.gearman_server.port)
        self.addCleanup(client.shutdown)
        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        # create three autohold requests, scoped to job, change and
        # a specific ref
        change = str(A.number)
        ref = A.data['currentPatchSet']['ref']
        r1 = client.autohold('tenant-one', 'org/project', 'project-test2',
                             "", "", "reason text", 1)
        self.assertTrue(r1)
        r2 = client.autohold('tenant-one', 'org/project', 'project-test2',
                             change, "", "reason text", 1)
        self.assertTrue(r2)
        r3 = client.autohold('tenant-one', 'org/project', 'project-test2',
                             "", ref, "reason text", 1)
        self.assertTrue(r3)
        # Fail 3 jobs for the same change, and verify that the autohold
        # requests are fullfilled in the expected order: from the most
        # specific towards the most generic one.
        def _fail_job_and_verify_autohold_request(change_obj, ref_filter):
            self.executor_server.failJob('project-test2', change_obj)
            self.fake_gerrit.addEvent(change_obj.getPatchsetCreatedEvent(1))
            self.waitUntilSettled()
            # Check nodepool for a held node
            held_node = None
            for node in self.fake_nodepool.getNodes():
                if node['state'] == zuul.model.STATE_HOLD:
                    held_node = node
                    break
            self.assertIsNotNone(held_node)
            self.assertEqual(
                held_node['hold_job'],
                " ".join(['tenant-one',
                          'review.example.com/org/project',
                          'project-test2', ref_filter])
            )
            self.assertFalse(held_node['_lock'], "Node %s is locked" %
                             (node['_oid'],))
            self.fake_nodepool.removeNode(held_node)
        _fail_job_and_verify_autohold_request(A, ref)
        ref = "refs/changes/%s/%s/.*" % (str(change)[-1:], str(change))
        _fail_job_and_verify_autohold_request(A, ref)
        _fail_job_and_verify_autohold_request(A, ".*")
    @simple_layout('layouts/autohold.yaml')
    def test_autohold_ignores_aborted_jobs(self):
        client = zuul.rpcclient.RPCClient('127.0.0.1',
                                          self.gearman_server.port)
        self.addCleanup(client.shutdown)
        r = client.autohold('tenant-one', 'org/project', 'project-test2',
-                            "reason text", 1)
+                            "", "", "reason text", 1)
        self.assertTrue(r)
        self.executor_server.hold_jobs_in_build = True
@ -1624,7 +1762,7 @@ class TestScheduler(ZuulTestCase):
        self.addCleanup(client.shutdown)
        r = client.autohold('tenant-one', 'org/project', 'project-test2',
-                            "reason text", 1)
+                            "", "", "reason text", 1)
        self.assertTrue(r)
        autohold_requests = client.autohold_list()
@ -1633,11 +1771,12 @@ class TestScheduler(ZuulTestCase):
        # The single dict key should be a CSV string value
        key = list(autohold_requests.keys())[0]
-        tenant, project, job = key.split(',')
+        tenant, project, job, ref_filter = key.split(',')
        self.assertEqual('tenant-one', tenant)
        self.assertIn('org/project', project)
        self.assertEqual('project-test2', job)
        self.assertEqual(".*", ref_filter)
        # Note: the value is converted from set to list by json.
        self.assertEqual([1, "reason text"], autohold_requests[key])
--- a/zuul/cmd/client.py
+++ b/zuul/cmd/client.py
@ -51,6 +51,11 @@ class Client(zuul.cmd.ZuulApp):
                                  required=True)
        cmd_autohold.add_argument('--job', help='job name',
                                  required=True)
        cmd_autohold.add_argument('--change',
                                  help='specific change to hold nodes for',
                                  required=False, default='')
        cmd_autohold.add_argument('--ref', help='git ref to hold nodes for',
                                  required=False, default='')
        cmd_autohold.add_argument('--reason', help='reason for the hold',
                                  required=True)
        cmd_autohold.add_argument('--count',
@ -173,9 +178,15 @@ class Client(zuul.cmd.ZuulApp):
    def autohold(self):
        client = zuul.rpcclient.RPCClient(
            self.server, self.port, self.ssl_key, self.ssl_cert, self.ssl_ca)
        if self.args.change and self.args.ref:
            print("Change and ref can't be both used for the same request")
            return False
        r = client.autohold(tenant=self.args.tenant,
                            project=self.args.project,
                            job=self.args.job,
                            change=self.args.change,
                            ref=self.args.ref,
                            reason=self.args.reason,
                            count=self.args.count)
        return r
@ -190,14 +201,19 @@ class Client(zuul.cmd.ZuulApp):
            return True
        table = prettytable.PrettyTable(
-            field_names=['Tenant', 'Project', 'Job', 'Count', 'Reason'])
+            field_names=[
                'Tenant', 'Project', 'Job', 'Ref Filter', 'Count', 'Reason'
            ])
        for key, value in autohold_requests.items():
            # The key comes to us as a CSV string because json doesn't like
            # non-str keys.
-            tenant_name, project_name, job_name = key.split(',')
+            tenant_name, project_name, job_name, ref_filter = key.split(',')
            count, reason = value
-            table.add_row([tenant_name, project_name, job_name, count, reason])
+
            table.add_row([
                tenant_name, project_name, job_name, ref_filter, count, reason
            ])
        print(table)
        return True
--- a/zuul/driver/gerrit/gerritsource.py
+++ b/zuul/driver/gerrit/gerritsource.py
@ -138,6 +138,10 @@ class GerritSource(BaseSource):
        )
        return [f]
    def getRefForChange(self, change):
        partial = change[-2:]
        return "refs/changes/%s/%s/.*" % (partial, change)
 approval = vs.Schema({'username': str,
                      'email': str,
--- a/zuul/driver/git/gitsource.py
+++ b/zuul/driver/git/gitsource.py
@ -68,3 +68,6 @@ class GitSource(BaseSource):
    def getRejectFilters(self, config):
        return []
    def getRefForChange(self, change):
        raise NotImplemented()
--- a/zuul/driver/github/githubsource.py
+++ b/zuul/driver/github/githubsource.py
@ -144,6 +144,9 @@ class GithubSource(BaseSource):
        )
        return [f]
    def getRefForChange(self, change):
        return "refs/pull/%s/head" % change
 review = v.Schema({'username': str,
                   'email': str,
--- a/zuul/rpcclient.py
+++ b/zuul/rpcclient.py
@ -48,10 +48,12 @@ class RPCClient(object):
        self.log.debug("Job complete, success: %s" % (not job.failure))
        return job
-    def autohold(self, tenant, project, job, reason, count):
+    def autohold(self, tenant, project, job, change, ref, reason, count):
        data = {'tenant': tenant,
                'project': project,
                'job': job,
                'change': change,
                'ref': ref,
                'reason': reason,
                'count': count}
        return not self.submitJob('zuul:autohold', data).failure
--- a/zuul/rpclistener.py
+++ b/zuul/rpclistener.py
@ -150,7 +150,20 @@ class RPCListener(object):
            job.sendWorkException(error.encode('utf8'))
            return
        if args['change'] and args['ref']:
            job.sendWorkException("Change and ref can't be both used "
                                  "for the same request")
        if args['change']:
            # Convert change into ref based on zuul connection
            ref_filter = project.source.getRefForChange(args['change'])
        elif args['ref']:
            ref_filter = "%s" % args['ref']
        else:
            ref_filter = ".*"
        params['job_name'] = args['job']
        params['ref_filter'] = ref_filter
        params['reason'] = args['reason']
        if args['count'] < 0:
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@ -19,6 +19,7 @@ import json
 import logging
 import os
 import pickle
 import re
 import queue
 import socket
 import sys
@ -436,8 +437,9 @@ class Scheduler(threading.Thread):
        self.last_reconfigured = int(time.time())
        # TODOv3(jeblair): reconfigure time should be per-tenant
-    def autohold(self, tenant_name, project_name, job_name, reason, count):
+    def autohold(self, tenant_name, project_name, job_name, ref_filter,
-        key = (tenant_name, project_name, job_name)
+                 reason, count):
        key = (tenant_name, project_name, job_name, ref_filter)
        if count == 0 and key in self.autohold_requests:
            self.log.debug("Removing autohold for %s", key)
            del self.autohold_requests[key]
@ -973,6 +975,66 @@ class Scheduler(threading.Thread):
            self.log.exception("Exception estimating build time:")
        pipeline.manager.onBuildStarted(event.build)
    def _getAutoholdRequestKey(self, build):
        change = build.build_set.item.change
        autohold_key_base = (build.pipeline.layout.tenant.name,
                             change.project.canonical_name,
                             build.job.name)
        class Scope(object):
            """Enum defining a precedence/priority of autohold requests.
            Autohold requests for specific refs should be fulfilled first,
            before those for changes, and generic jobs.
            Matching algorithm goes over all existing autohold requests, and
            returns one with the highest number (in case of duplicated
            requests the last one wins).
            """
            NONE = 0
            JOB = 1
            CHANGE = 2
            REF = 3
        def autohold_key_base_issubset(base, request_key):
            """check whether the given key is a subset of the build key"""
            index = 0
            base_len = len(base)
            while index < base_len:
                if base[index] != request_key[index]:
                    return False
                index += 1
            return True
        # Do a partial match of the autohold key against all autohold
        # requests, ignoring the last element of the key (ref filter),
        # and finally do a regex match between ref filter from
        # the autohold request and the build's change ref to check
        # if it matches. Lastly, make sure that we match the most
        # specific autohold request by comparing "scopes"
        # of requests - the most specific is selected.
        autohold_key = None
        scope = Scope.NONE
        for request in self.autohold_requests:
            ref_filter = request[-1]
            if not autohold_key_base_issubset(autohold_key_base, request) \
                or not re.match(ref_filter, change.ref):
                continue
            if ref_filter == ".*":
                candidate_scope = Scope.JOB
            elif ref_filter.endswith(".*"):
                candidate_scope = Scope.CHANGE
            else:
                candidate_scope = Scope.REF
            if candidate_scope > scope:
                scope = candidate_scope
                autohold_key = request
        return autohold_key
    def _doBuildCompletedEvent(self, event):
        build = event.build
@ -981,11 +1043,9 @@ class Scheduler(threading.Thread):
        # the nodes to nodepool.
        try:
            nodeset = build.nodeset
-            autohold_key = (build.pipeline.layout.tenant.name,
+            autohold_key = self._getAutoholdRequestKey(build)
-                            build.build_set.item.change.project.canonical_name,
+
-                            build.job.name)
+            if (build.result == "FAILURE" and autohold_key):
            if (build.result == "FAILURE" and
                autohold_key in self.autohold_requests):
                # We explicitly only want to hold nodes for jobs if they have
                # failed and have an autohold request.
                try: