Fix zuul-web layout update on full/tenant-reconfigure

When an operator issues a full-reconfigure or tenant-reconfigure
command to the scheduler, the branch cache is cleared.  It will
be fully populated again during the subsequent full-reconfiguration,
or repopulated gradually over time after a
tenant-reconfiguration.  But while the branch cache is
clear, zuul-web will be unable to update layouts since it is incapable
of populating the branch cache itself.

This produces a short period of errors during a full-reconfigure,
or a potentially long period of errors after a tenant-reconfigure.

To correct this, we now detect whether a layout update in zuul-web
is incomplete due to a branch cache error, and retry the update
later.  In the scheduler, we only clear the projects from the branch
cache that are affected by the tenants we are reloading (still all the
projects for a full-reconfigure).  This limits the time during which
zuul-web will be unable to update the layout to only the time that
the scheduler spends actually performing a reconfiguration.

(Note that in general, the error here is not because zuul-web is
loading the layout for the tenant that zuul-scheduler is reconfiguring,
but rather it is loading a layout for a tenant which has projects
in common with the tenant that is being reconfigured.)

Change-Id: I6794da4d2316f7df6ab302c74b3efb5df4ce461a
This commit is contained in:
James E. Blair 2022-05-14 13:52:11 -07:00
parent 3010b7cca0
commit 3974b5a9b5
7 changed files with 220 additions and 42 deletions

View File

@ -4389,9 +4389,16 @@ class SchedulerTestApp:
else:
self.sched.validateTenants(self.config, validate_tenants)
def fullReconfigure(self):
def fullReconfigure(self, command_socket=False):
try:
self.sched.reconfigure(self.config)
if command_socket:
command_socket = self.sched.config.get(
'scheduler', 'command_socket')
with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
s.connect(command_socket)
s.sendall('full-reconfigure\n'.encode('utf8'))
else:
self.sched.reconfigure(self.config)
except Exception:
self.log.exception("Reconfiguration failed:")
@ -4408,9 +4415,19 @@ class SchedulerTestApp:
except Exception:
self.log.exception("Reconfiguration failed:")
def tenantReconfigure(self, tenants):
def tenantReconfigure(self, tenants, command_socket=False):
try:
self.sched.reconfigure(self.config, smart=False, tenants=tenants)
if command_socket:
command_socket = self.sched.config.get(
'scheduler', 'command_socket')
args = json.dumps(tenants)
with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
s.connect(command_socket)
s.sendall(f'tenant-reconfigure {args}\n'.
encode('utf8'))
else:
self.sched.reconfigure(
self.config, smart=False, tenants=tenants)
except Exception:
self.log.exception("Reconfiguration failed:")

View File

@ -28,6 +28,7 @@ from unittest import skip
import requests
from zuul.lib.statsd import normalize_statsd_name
from zuul.zk.locks import tenant_write_lock
import zuul.web
from tests.base import ZuulTestCase, AnsibleZuulTestCase
@ -1227,6 +1228,94 @@ class TestWebStatusDisplayBranch(BaseTestWeb):
class TestWebMultiTenant(BaseTestWeb):
tenant_config_file = 'config/multi-tenant/main.yaml'
def test_tenant_reconfigure_command(self):
# The 'zuul-scheduler tenant-reconfigure' and full-reconfigure
# are used to correct problems, and as such they clear the
# branch cache. Until the reconfiguration is complete,
# zuul-web will be unable to load configuration for any tenant
# which has projects that have been cleared from the branch
# cache. This test verifies that we retry that operation
# after encountering missing branch errors.
sched = self.scheds.first.sched
web = self.web.web
# Don't perform any automatic config updates on zuul web so
# that we can control the sequencing.
self.web.web._system_config_running = False
self.web.web.system_config_cache_wake_event.set()
self.web.web.system_config_thread.join()
first_state = sched.tenant_layout_state.get('tenant-one')
self.assertEqual(first_state,
web.local_layout_state.get('tenant-one'))
data = self.get_url('api/tenant/tenant-one/jobs').json()
self.assertEqual(len(data), 4)
# Reconfigure tenant-one so that the layout state will be
# different and we can start a layout update in zuul-web
# later.
self.log.debug("Reconfigure tenant-one")
self.scheds.first.tenantReconfigure(['tenant-one'])
self.waitUntilSettled()
self.log.debug("Done reconfigure tenant-one")
second_state = sched.tenant_layout_state.get('tenant-one')
self.assertEqual(second_state,
sched.local_layout_state.get('tenant-one'))
self.assertEqual(first_state,
web.local_layout_state.get('tenant-one'))
self.log.debug("Grab write lock for tenant-two")
with tenant_write_lock(self.zk_client, 'tenant-two') as lock:
# Start a reconfiguration of tenant-two; allow it to
# proceed past the point that the branch cache is cleared
# and is waiting on the lock we hold.
self.scheds.first.tenantReconfigure(
['tenant-two'], command_socket=True)
for _ in iterate_timeout(30, "reconfiguration to start"):
if 'RECONFIG' in lock.contenders():
break
# Now that the branch cache is cleared as part of the
# tenant-two reconfiguration, allow zuul-web to
# reconfigure tenant-one. This should produce an error
# because of the missing branch cache.
self.log.debug("Web update layout 1")
self.web.web.updateSystemConfig()
self.assertFalse(self.web.web.updateLayout())
self.log.debug("Web update layout done")
self.assertEqual(second_state,
sched.local_layout_state.get('tenant-one'))
self.assertEqual(first_state,
web.local_layout_state.get('tenant-one'))
# Make sure we can still access tenant-one's config via
# zuul-web
data = self.get_url('api/tenant/tenant-one/jobs').json()
self.assertEqual(len(data), 4)
self.log.debug("Release write lock for tenant-two")
for _ in iterate_timeout(30, "reconfiguration to finish"):
if 'RECONFIG' not in lock.contenders():
break
self.log.debug("Web update layout 2")
self.web.web.updateSystemConfig()
self.web.web.updateLayout()
self.log.debug("Web update layout done")
# Depending on tenant order, we may need to run one more time
self.log.debug("Web update layout 3")
self.web.web.updateSystemConfig()
self.assertTrue(self.web.web.updateLayout())
self.log.debug("Web update layout done")
self.assertEqual(second_state,
sched.local_layout_state.get('tenant-one'))
self.assertEqual(second_state,
web.local_layout_state.get('tenant-one'))
data = self.get_url('api/tenant/tenant-one/jobs').json()
self.assertEqual(len(data), 4)
def test_web_labels_allowed_list(self):
labels = ["tenant-one-label", "fake", "tenant-two-label"]
self.fake_nodepool.registerLauncher(labels, "FakeLauncher2")

View File

@ -25,6 +25,7 @@ import voluptuous as vs
from zuul import change_matcher
from zuul import model
from zuul.connection import ReadOnlyBranchCacheError
from zuul.lib import yamlutil as yaml
import zuul.manager.dependent
import zuul.manager.independent
@ -233,6 +234,8 @@ def project_configuration_exceptions(context, accumulator):
yield
except ConfigurationSyntaxError:
raise
except ReadOnlyBranchCacheError:
raise
except Exception as e:
intro = textwrap.fill(textwrap.dedent("""\
Zuul encountered an error while accessing the repo {repo}. The error

View File

@ -21,6 +21,10 @@ from zuul.lib.logutil import get_annotated_logger
from zuul.model import Project
class ReadOnlyBranchCacheError(RuntimeError):
pass
class BaseConnection(object, metaclass=abc.ABCMeta):
"""Base class for connections.
@ -241,15 +245,35 @@ class ZKBranchCacheMixin:
# Handle the case where tenant validation doesn't use the cache
branches = None
if branches is not None:
if branches:
return sorted(branches)
if self.read_only:
if branches is None:
# A scheduler hasn't attempted to fetch them yet
raise ReadOnlyBranchCacheError(
"Will not fetch project branches as read-only is set")
# A scheduler has previously attempted a fetch, but got
# the empty list due to an error; we can't retry since
# we're read-only
raise RuntimeError(
"Will not fetch project branches as read-only is set")
# We need to perform a query
branches = self._fetchProjectBranches(project, exclude_unprotected)
try:
branches = self._fetchProjectBranches(project, exclude_unprotected)
except Exception:
# We weren't able to get the branches. We need to tell
# future schedulers to try again but tell zuul-web that we
# tried and failed. Set the branches to the empty list to
# indicate that we have performed a fetch and retrieved no
# data. Any time we encounter the empty list in the
# cache, we will try again (since it is not reasonable to
# have a project with no branches).
if self._branch_cache:
self._branch_cache.setProjectBranches(
project.name, exclude_unprotected, [])
raise
self.log.info("Got branches for %s" % project.name)
if self._branch_cache:
@ -315,14 +339,14 @@ class ZKBranchCacheMixin:
# again.
event.branch_protected = True
def clearBranchCache(self):
def clearBranchCache(self, projects=None):
"""Clear the branch cache
In case the branch cache gets out of sync with the source,
this method can be called to clear it and force querying the
source the next time the cache is used.
"""
self._branch_cache.clear()
self._branch_cache.clear(projects)
class ZKChangeCacheMixin:

View File

@ -16,6 +16,7 @@
# License for the specific language governing permissions and limitations
# under the License.
import itertools
import logging
import socket
import sys
@ -1365,10 +1366,27 @@ class Scheduler(threading.Thread):
# Consider caches valid if the cache ltime >= event ltime
min_ltimes = defaultdict(
lambda: defaultdict(lambda: event.zuul_event_ltime))
# Invalidate the branch cache for all connections
# Invalidate the branch cache
for connection in self.connections.connections.values():
if hasattr(connection, 'clearBranchCache'):
connection.clearBranchCache()
if event.tenants:
# Only clear the projects used by this
# tenant (zuul-web won't be able to load
# any tenants that we don't immediately
# reconfigure after clearing)
for tenant_name in event.tenants:
projects = [
tpc.project.name for tpc in
itertools.chain(
self.abide.getConfigTPCs(tenant_name),
self.abide.getUntrustedTPCs(
tenant_name))
]
connection.clearBranchCache(projects)
else:
# Clear all projects since we're reloading
# all tenants.
connection.clearBranchCache()
ltime = self.zk_client.getCurrentLtime()
# Consider the branch cache valid only after we
# cleared it

View File

@ -36,7 +36,7 @@ import prometheus_client
import zuul.executor.common
from zuul import exceptions
from zuul.configloader import ConfigLoader
from zuul.connection import BaseConnection
from zuul.connection import BaseConnection, ReadOnlyBranchCacheError
import zuul.lib.repl
from zuul.lib import commandsocket, encryption, streamer_utils
from zuul.lib.ansible import AnsibleManager
@ -2048,7 +2048,11 @@ class ZuulWeb(object):
if not self._system_config_running:
return
self.updateSystemConfig()
self.updateLayout()
if not self.updateLayout():
# Branch cache errors with at least one tenant,
# try again.
time.sleep(10)
self.system_config_cache_wake_event.set()
except Exception:
self.log.exception("Exception while updating system config")
@ -2084,6 +2088,7 @@ class ZuulWeb(object):
tenant_names = set(self.abide.tenants)
tenant_names.update(self.unparsed_abide.tenants.keys())
success = True
for tenant_name in tenant_names:
# Reload the tenant if the layout changed.
if (self.local_layout_state.get(tenant_name)
@ -2091,31 +2096,48 @@ class ZuulWeb(object):
continue
self.log.debug("Reloading tenant %s", tenant_name)
with tenant_read_lock(self.zk_client, tenant_name):
layout_state = self.tenant_layout_state.get(tenant_name)
layout_uuid = layout_state and layout_state.uuid
if layout_state:
min_ltimes = self.tenant_layout_state.getMinLtimes(
layout_state)
branch_cache_min_ltimes = (
layout_state.branch_cache_min_ltimes)
else:
# Consider all project branch caches valid if
# we don't have a layout state.
min_ltimes = defaultdict(
lambda: defaultdict(lambda: -1))
branch_cache_min_ltimes = defaultdict(lambda: -1)
# The tenant will be stored in self.abide.tenants after
# it was loaded.
tenant = loader.loadTenant(
self.abide, tenant_name, self.ansible_manager,
self.unparsed_abide, min_ltimes=min_ltimes,
layout_uuid=layout_uuid,
branch_cache_min_ltimes=branch_cache_min_ltimes)
if tenant is not None:
self.local_layout_state[tenant_name] = layout_state
else:
with suppress(KeyError):
del self.local_layout_state[tenant_name]
try:
self._updateTenantLayout(loader, tenant_name)
except ReadOnlyBranchCacheError:
self.log.info(
"Unable to update layout due to incomplete branch "
"cache, possibly due to in-progress tenant "
"reconfiguration; will retry")
success = False
self.log.debug("Done updating layout state")
return success
def _updateTenantLayout(self, loader, tenant_name):
# Reload the tenant if the layout changed.
if (self.local_layout_state.get(tenant_name)
== self.tenant_layout_state.get(tenant_name)):
return
self.log.debug("Reloading tenant %s", tenant_name)
with tenant_read_lock(self.zk_client, tenant_name):
layout_state = self.tenant_layout_state.get(tenant_name)
layout_uuid = layout_state and layout_state.uuid
if layout_state:
min_ltimes = self.tenant_layout_state.getMinLtimes(
layout_state)
branch_cache_min_ltimes = (
layout_state.branch_cache_min_ltimes)
else:
# Consider all project branch caches valid if
# we don't have a layout state.
min_ltimes = defaultdict(
lambda: defaultdict(lambda: -1))
branch_cache_min_ltimes = defaultdict(lambda: -1)
# The tenant will be stored in self.abide.tenants after
# it was loaded.
tenant = loader.loadTenant(
self.abide, tenant_name, self.ansible_manager,
self.unparsed_abide, min_ltimes=min_ltimes,
layout_uuid=layout_uuid,
branch_cache_min_ltimes=branch_cache_min_ltimes)
if tenant is not None:
self.local_layout_state[tenant_name] = layout_state
else:
with suppress(KeyError):
del self.local_layout_state[tenant_name]

View File

@ -104,12 +104,17 @@ class BranchCache:
self.cache = BranchCacheZKObject.new(
self.zk_context, _path=data_path)
def clear(self):
def clear(self, projects=None):
"""Clear the cache"""
with locked(self.wlock):
with self.cache.activeContext(self.zk_context):
self.cache.protected.clear()
self.cache.remainder.clear()
if projects is None:
self.cache.protected.clear()
self.cache.remainder.clear()
else:
for p in projects:
self.cache.protected.pop(p, None)
self.cache.remainder.pop(p, None)
def getProjectBranches(self, project_name, exclude_unprotected,
min_ltime=-1):