Fix loading_errors bug

This corrects an error
  Error in dynamic layout
  Traceback (most recent call last):
    File "/usr/local/lib/python3.8/site-packages/zuul/manager/__init__.py", line 625, in _loadDynamicLayout
      relevant_errors = self._findRelevantErrors(item,
    File "/usr/local/lib/python3.8/site-packages/zuul/manager/__init__.py", line 517, in _findRelevantErrors
      parent_layout.loading_errors.error_keys) or
  AttributeError: 'NoneType' object has no attribute 'loading_errors'

which can be caused when the following conditions are met:

* Tenant A is listed first.
* Tenant A holds projects A and B.
* Project B references a config object defined in project A.
* Tenant A loads without errors.
* Tenant B holds projects B and C.
* Tenant B has a standing configuration error due to the unknown
  reference to an object in project A from project B.
* Two or more changes to project C which cause dynamic
  configurations to be created are enqueued in a pipeline.
* The merge job for the second change finishes before the first.

This is because:

We cache configuration objects if they load without error in
their first tenant; that means that they can show up as errors in
later tenants, but as long as those other tenants aren't
proposing changes to that repo, it doesn't matter.  But it does
mean that every dynamic reconfiguration in this tenant will see
errors and will execute the code path that compares the new
dynamic configuration to the previous one to see if they are
relevant.

If a merge job for a dynamic config change arrives out of order,
we will compare it to the previous configuration to determine if
they are relevant, but since the previous layout had not been
calculated yet, the exception above was hit.

The solution is to indicate that the layout for the later change
is not ready until the layout for the previous change is.

Change-Id: Ibe1392a494d42b65080ab7e42f116db3869548ff
This commit is contained in:
James E. Blair 2020-05-05 16:02:02 -07:00
parent a655aee0f1
commit 319bbacfa1
10 changed files with 129 additions and 13 deletions

View File

@ -0,0 +1,2 @@
# Minimal no-op playbook: targets all hosts but runs no tasks,
# so any job using it succeeds immediately.
- hosts: all
  tasks: []

View File

@ -0,0 +1,23 @@
# Shared config project: defines the check pipeline, the base job,
# and attaches the base job to every project.

- pipeline:
    name: check
    manager: independent
    trigger:
      gerrit:
        - event: patchset-created
    success:
      gerrit:
        Verified: 1
    failure:
      gerrit:
        Verified: -1

- job:
    name: base
    parent: null
    run: playbooks/job.yaml

- project:
    # Match every project so all repos run the base job in check.
    name: ^.*
    check:
      jobs:
        - base

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,3 @@
# Job whose parent is defined in a different project.
# NOTE(review): parent-job appears to live in org/project3, which is not
# part of tenant-two, so this reference is the source of that tenant's
# standing configuration error — confirm against main.yaml.
- job:
    name: child-job
    parent: parent-job

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,2 @@
# Defines the parent job that child-job (in another project) references.
- job:
    name: parent-job

View File

@ -0,0 +1,19 @@
# Two-tenant layout for the broken-multi-tenant regression test.
# tenant-one loads project2 and project3 without error, caching their
# config objects.  tenant-two includes project2 but NOT project3, so
# project2's reference to a job defined in project3 leaves tenant-two
# with a standing configuration error.
- tenant:
    name: tenant-one
    source:
      gerrit:
        config-projects:
          - common-config
        untrusted-projects:
          - org/project2
          - org/project3

- tenant:
    name: tenant-two
    source:
      gerrit:
        config-projects:
          - common-config
        untrusted-projects:
          - org/project1
          - org/project2

View File

@ -3358,6 +3358,80 @@ class TestBrokenConfig(ZuulTestCase):
"A should have failed the check pipeline")
class TestBrokenMultiTenantConfig(ZuulTestCase):
    # Test we can deal with a broken multi-tenant config
    tenant_config_file = 'config/broken-multi-tenant/main.yaml'

    def test_loading_errors(self):
        # This regression test came about when we discovered the following:
        #
        # * We cache configuration objects if they load without error
        #   in their first tenant; that means that they can show up as
        #   errors in later tenants, but as long as those other
        #   tenants aren't proposing changes to that repo (which is
        #   unlikely in this situation; this usually arises if the
        #   tenant just wants to use some foreign jobs), users won't
        #   be blocked by the error.
        #
        # * If a merge job for a dynamic config change arrives out of
        #   order, we will build the new configuration and if there
        #   are errors, we will compare it to the previous
        #   configuration to determine if they are relevant, but that
        #   caused an error since the previous layout had not been
        #   calculated yet.  It's pretty hard to end up with
        #   irrelevant errors except by virtue of the first point
        #   above, which is why this test relies on a second tenant.

        # This test has two tenants.  The first loads project2 and
        # project3 without errors and all config objects are cached.
        # The second tenant loads only project1 and project2.
        # Project2 references a job that is defined in project3, so
        # the tenant loads with an error, but proceeds.

        # Don't run any merge jobs, so we can run them out of order.
        self.gearman_server.hold_merge_jobs_in_queue = True

        # Create a first change which modifies the config (and
        # therefore will require a merge job).
        in_repo_conf = textwrap.dedent(
            """
            - job: {'name': 'foo'}
            """)
        file_dict = {'.zuul.yaml': in_repo_conf}
        A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A',
                                           files=file_dict)

        # Create a second change which also modifies the config and
        # depends on the first, so both end up in the same queue.
        B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B',
                                           files=file_dict)
        B.setDependsOn(A, 1)

        self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
        self.waitUntilSettled()

        # There should be a merge job for each change.
        self.assertEqual(len(self.scheds.first.sched.merger.jobs), 2)

        jobs = [job for job in self.gearman_server.getQueue()
                if job.name.startswith(b'merger:')]
        # Release the second merge job (out of order, to reproduce the
        # original loading_errors AttributeError).
        jobs[-1].waiting = False
        self.gearman_server.wakeConnections()
        self.waitUntilSettled()

        # At this point we should still be waiting on the first
        # change's merge job.
        self.assertHistory([])

        # Proceed.
        self.gearman_server.hold_merge_jobs_in_queue = False
        self.gearman_server.release()
        self.waitUntilSettled()
        self.assertHistory([
            dict(name='base', result='SUCCESS', changes='1,1 2,1'),
        ])
class TestProjectKeys(ZuulTestCase):
# Test that we can generate project keys

View File

@ -563,19 +563,6 @@ class PipelineManager(metaclass=ABCMeta):
zuul_event_id=None)
untrusted_errors = len(untrusted_layout.loading_errors) > 0
# TODO (jeblair): remove this section of extra verbose
# debug logging when we have resolved the loading_errors
# bug.
log.debug("Dynamic layout: trusted errors: %s layout: %s",
trusted_errors, trusted_layout)
if trusted_layout:
for err in trusted_layout.loading_errors.errors[:10]:
log.debug(err.error)
log.debug("Dynamic layout: untrusted errors: %s layout: %s",
untrusted_errors, untrusted_layout)
if untrusted_layout:
for err in untrusted_layout.loading_errors.errors[:10]:
log.debug(err.error)
# Configuration state handling switchboard. Intentionally verbose
# and repetitive to be exceptionally clear that we handle all
# possible cases correctly. Note we never return trusted_layout
@ -666,6 +653,9 @@ class PipelineManager(metaclass=ABCMeta):
def getLayout(self, item):
if item.item_ahead:
fallback_layout = item.item_ahead.layout
if fallback_layout is None:
# We're probably waiting on a merge job for the item ahead.
return None
else:
fallback_layout = item.pipeline.tenant.layout
if not item.change.updatesConfig(item.pipeline.tenant):