Fix loading_errors bug

This corrects an error
  Error in dynamic layout
  Traceback (most recent call last):
    File "/usr/local/lib/python3.8/site-packages/zuul/manager/__init__.py", line 625, in _loadDynamicLayout
      relevant_errors = self._findRelevantErrors(item,
    File "/usr/local/lib/python3.8/site-packages/zuul/manager/__init__.py", line 517, in _findRelevantErrors
      parent_layout.loading_errors.error_keys) or
  AttributeError: 'NoneType' object has no attribute 'loading_errors'

which can be caused when the following conditions are met:

* Tenant A is listed first.
* Tenant A holds projects A and B.
* Project B references a config object defined in project A.
* Tenant A loads without errors.
* Tenant B holds projects B and C.
* Tenant B has a standing configuration error due to the unknown
  reference to an object in project A from project B.
* Two or more changes to project C which cause dynamic
  configurations to be created are enqueued in a pipeline.
* The merge job for the second change finishes before the first.

This is because:

We cache configuration objects if they load without error in
their first tenant; that means that they can show up as errors in
later tenants, but as long as those other tenants aren't
proposing changes to that repo, it doesn't matter.  But it does
mean that every dynamic reconfiguration in this tenant will see
errors and will execute the code path that compares the new
dynamic configuration to the previous one to see if they are
relevant.

If a merge job for a dynamic config change arrives out of order,
we will compare it to the previous configuration to determine if
they are relevant, but since the previous layout had not been
calculated yet, the exception above was hit.

The solution is to indicate that the layout for the later change
is not ready until the layout for the previous change is.

Change-Id: Ibe1392a494d42b65080ab7e42f116db3869548ff
This commit is contained in:
James E. Blair 2020-05-05 16:02:02 -07:00
parent a655aee0f1
commit 319bbacfa1
10 changed files with 129 additions and 13 deletions

View File

@ -0,0 +1,2 @@
# Minimal no-op playbook: targets all hosts but runs no tasks,
# so any job using it succeeds immediately.
- hosts: all
  tasks: []

View File

@ -0,0 +1,23 @@
# Shared config project: defines the check pipeline, the base job,
# and attaches the base job to every project.

- pipeline:
    name: check
    manager: independent
    trigger:
      gerrit:
        - event: patchset-created
    success:
      gerrit:
        Verified: 1
    failure:
      gerrit:
        Verified: -1

- job:
    name: base
    parent: null
    run: playbooks/job.yaml

- project:
    # Match every project so all repos run the base job in check.
    name: ^.*
    check:
      jobs:
        - base

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,3 @@
# Job whose parent is defined in a different project.
# NOTE(review): parent-job appears to live in org/project3, which is not
# part of tenant-two, so this reference is the source of that tenant's
# standing configuration error — confirm against main.yaml.
- job:
    name: child-job
    parent: parent-job

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,2 @@
# Defines the parent job that child-job (in another project) references.
- job:
    name: parent-job

View File

@ -0,0 +1,19 @@
# Two-tenant layout for the broken-multi-tenant regression test.
# tenant-one loads project2 and project3 without error, caching their
# config objects.  tenant-two includes project2 but NOT project3, so
# project2's reference to a job defined in project3 leaves tenant-two
# with a standing configuration error.
- tenant:
    name: tenant-one
    source:
      gerrit:
        config-projects:
          - common-config
        untrusted-projects:
          - org/project2
          - org/project3

- tenant:
    name: tenant-two
    source:
      gerrit:
        config-projects:
          - common-config
        untrusted-projects:
          - org/project1
          - org/project2

View File

@ -3358,6 +3358,80 @@ class TestBrokenConfig(ZuulTestCase):
"A should have failed the check pipeline")
class TestBrokenMultiTenantConfig(ZuulTestCase):
    # Test we can deal with a broken multi-tenant config
    tenant_config_file = 'config/broken-multi-tenant/main.yaml'

    def test_loading_errors(self):
        # This regression test came about when we discovered the following:
        #
        # * We cache configuration objects if they load without error
        #   in their first tenant; that means that they can show up as
        #   errors in later tenants, but as long as those other
        #   tenants aren't proposing changes to that repo (which is
        #   unlikely in this situation; this usually arises if the
        #   tenant just wants to use some foreign jobs), users won't
        #   be blocked by the error.
        #
        # * If a merge job for a dynamic config change arrives out of
        #   order, we will build the new configuration and if there
        #   are errors, we will compare it to the previous
        #   configuration to determine if they are relevant, but that
        #   caused an error since the previous layout had not been
        #   calculated yet.  It's pretty hard to end up with
        #   irrelevant errors except by virtue of the first point
        #   above, which is why this test relies on a second tenant.

        # This test has two tenants.  The first loads project2 and
        # project3 without errors and all config objects are cached.
        # The second tenant loads only project1 and project2.
        # Project2 references a job that is defined in project3, so
        # the tenant loads with an error, but proceeds.

        # Don't run any merge jobs, so we can run them out of order.
        self.gearman_server.hold_merge_jobs_in_queue = True

        # Create a first change which modifies the config (and
        # therefore will require a merge job).
        in_repo_conf = textwrap.dedent(
            """
            - job: {'name': 'foo'}
            """)
        file_dict = {'.zuul.yaml': in_repo_conf}
        A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A',
                                           files=file_dict)

        # Create a second change which also modifies the config and
        # depends on the first, so both end up in the same queue.
        B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B',
                                           files=file_dict)
        B.setDependsOn(A, 1)

        self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
        self.waitUntilSettled()

        # There should be a merge job for each change.
        self.assertEqual(len(self.scheds.first.sched.merger.jobs), 2)

        jobs = [job for job in self.gearman_server.getQueue()
                if job.name.startswith(b'merger:')]
        # Release the second merge job (out of order, to reproduce the
        # original loading_errors AttributeError).
        jobs[-1].waiting = False
        self.gearman_server.wakeConnections()
        self.waitUntilSettled()

        # At this point we should still be waiting on the first
        # change's merge job.
        self.assertHistory([])

        # Proceed.
        self.gearman_server.hold_merge_jobs_in_queue = False
        self.gearman_server.release()
        self.waitUntilSettled()
        self.assertHistory([
            dict(name='base', result='SUCCESS', changes='1,1 2,1'),
        ])
class TestProjectKeys(ZuulTestCase):
# Test that we can generate project keys

View File

@ -563,19 +563,6 @@ class PipelineManager(metaclass=ABCMeta):
zuul_event_id=None)
untrusted_errors = len(untrusted_layout.loading_errors) > 0
# TODO (jeblair): remove this section of extra verbose
# debug logging when we have resolved the loading_errors
# bug.
log.debug("Dynamic layout: trusted errors: %s layout: %s",
trusted_errors, trusted_layout)
if trusted_layout:
for err in trusted_layout.loading_errors.errors[:10]:
log.debug(err.error)
log.debug("Dynamic layout: untrusted errors: %s layout: %s",
untrusted_errors, untrusted_layout)
if untrusted_layout:
for err in untrusted_layout.loading_errors.errors[:10]:
log.debug(err.error)
# Configuration state handling switchboard. Intentionally verbose
# and repetitive to be exceptionally clear that we handle all
# possible cases correctly. Note we never return trusted_layout
@ -666,6 +653,9 @@ class PipelineManager(metaclass=ABCMeta):
def getLayout(self, item):
if item.item_ahead:
fallback_layout = item.item_ahead.layout
if fallback_layout is None:
# We're probably waiting on a merge job for the item ahead.
return None
else:
fallback_layout = item.pipeline.tenant.layout
if not item.change.updatesConfig(item.pipeline.tenant):