Fix zuul-web layout update on full/tenant-reconfigure
When an operator issues a full-reconfigure or tenant-reconfigure command to the scheduler, the branch cache is cleared. It will be fully populated again during the subsequent full-reconfiguration, or eventually populated more slowly over time after a tenant-reconfiguration. But during the time the branch cache is clear, zuul-web will be unable to update layouts since it is incapable of populating the branch cache itself. This produces a short time period of errors during a full-reconfig or a potentially long time period of errors after a tenant-reconfig. To correct this, we now detect whether a layout update in zuul-web is incomplete due to a branch cache error, and retry the update later. In the scheduler, we only clear the projects from the branch cache that are affected by the tenants we are reloading (still all the projects for a full-reconfigure). This limits the time during which zuul-web will be unable to update the layout to only the time that the scheduler spends actually performing a reconfiguration. (Note that in general, the error here is not because zuul-web is loading the layout for the tenant that zuul-scheduler is reconfiguring, but rather it is loading a layout for a tenant which has projects in common with the tenant that is being reconfigured.) Change-Id: I6794da4d2316f7df6ab302c74b3efb5df4ce461a
This commit is contained in:
parent
3010b7cca0
commit
3974b5a9b5
|
@ -4389,9 +4389,16 @@ class SchedulerTestApp:
|
|||
else:
|
||||
self.sched.validateTenants(self.config, validate_tenants)
|
||||
|
||||
def fullReconfigure(self, command_socket=False):
    """Trigger a full reconfiguration of the scheduler.

    :param bool command_socket: If True, issue the reconfiguration by
        writing ``full-reconfigure`` to the scheduler's command socket
        (exercising the operator command path); otherwise call the
        scheduler API directly.
    """
    try:
        if command_socket:
            # Use a distinct local name for the socket path rather
            # than shadowing the boolean parameter.
            path = self.sched.config.get(
                'scheduler', 'command_socket')
            with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
                s.connect(path)
                s.sendall('full-reconfigure\n'.encode('utf8'))
        else:
            self.sched.reconfigure(self.config)
    except Exception:
        # Test-app helper: log rather than abort the test run.
        self.log.exception("Reconfiguration failed:")
|
||||
|
||||
|
@ -4408,9 +4415,19 @@ class SchedulerTestApp:
|
|||
except Exception:
|
||||
self.log.exception("Reconfiguration failed:")
|
||||
|
||||
def tenantReconfigure(self, tenants, command_socket=False):
    """Trigger a reconfiguration of the named tenants.

    :param list tenants: The tenant names to reconfigure.
    :param bool command_socket: If True, issue the reconfiguration by
        writing ``tenant-reconfigure`` to the scheduler's command
        socket (exercising the operator command path); otherwise call
        the scheduler API directly.
    """
    try:
        if command_socket:
            # Use a distinct local name for the socket path rather
            # than shadowing the boolean parameter.
            path = self.sched.config.get(
                'scheduler', 'command_socket')
            args = json.dumps(tenants)
            with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
                s.connect(path)
                s.sendall(f'tenant-reconfigure {args}\n'.
                          encode('utf8'))
        else:
            self.sched.reconfigure(
                self.config, smart=False, tenants=tenants)
    except Exception:
        # Test-app helper: log rather than abort the test run.
        self.log.exception("Reconfiguration failed:")
|
||||
|
||||
|
|
|
@ -28,6 +28,7 @@ from unittest import skip
|
|||
import requests
|
||||
|
||||
from zuul.lib.statsd import normalize_statsd_name
|
||||
from zuul.zk.locks import tenant_write_lock
|
||||
import zuul.web
|
||||
|
||||
from tests.base import ZuulTestCase, AnsibleZuulTestCase
|
||||
|
@ -1227,6 +1228,94 @@ class TestWebStatusDisplayBranch(BaseTestWeb):
|
|||
class TestWebMultiTenant(BaseTestWeb):
|
||||
tenant_config_file = 'config/multi-tenant/main.yaml'
|
||||
|
||||
def test_tenant_reconfigure_command(self):
    # The 'zuul-scheduler tenant-reconfigure' and full-reconfigure
    # are used to correct problems, and as such they clear the
    # branch cache.  Until the reconfiguration is complete,
    # zuul-web will be unable to load configuration for any tenant
    # which has projects that have been cleared from the branch
    # cache.  This test verifies that we retry that operation
    # after encountering missing branch errors.
    sched = self.scheds.first.sched
    web = self.web.web
    # Don't perform any automatic config updates on zuul web so
    # that we can control the sequencing.
    self.web.web._system_config_running = False
    self.web.web.system_config_cache_wake_event.set()
    self.web.web.system_config_thread.join()

    first_state = sched.tenant_layout_state.get('tenant-one')
    self.assertEqual(first_state,
                     web.local_layout_state.get('tenant-one'))

    data = self.get_url('api/tenant/tenant-one/jobs').json()
    self.assertEqual(len(data), 4)

    # Reconfigure tenant-one so that the layout state will be
    # different and we can start a layout update in zuul-web
    # later.
    self.log.debug("Reconfigure tenant-one")
    self.scheds.first.tenantReconfigure(['tenant-one'])
    self.waitUntilSettled()
    self.log.debug("Done reconfigure tenant-one")

    second_state = sched.tenant_layout_state.get('tenant-one')
    self.assertEqual(second_state,
                     sched.local_layout_state.get('tenant-one'))
    self.assertEqual(first_state,
                     web.local_layout_state.get('tenant-one'))

    self.log.debug("Grab write lock for tenant-two")
    with tenant_write_lock(self.zk_client, 'tenant-two') as lock:
        # Start a reconfiguration of tenant-two; allow it to
        # proceed past the point that the branch cache is cleared
        # and is waiting on the lock we hold.
        self.scheds.first.tenantReconfigure(
            ['tenant-two'], command_socket=True)
        for _ in iterate_timeout(30, "reconfiguration to start"):
            if 'RECONFIG' in lock.contenders():
                break
        # Now that the branch cache is cleared as part of the
        # tenant-two reconfiguration, allow zuul-web to
        # reconfigure tenant-one.  This should produce an error
        # because of the missing branch cache.
        self.log.debug("Web update layout 1")
        self.web.web.updateSystemConfig()
        self.assertFalse(self.web.web.updateLayout())
        self.log.debug("Web update layout done")

        self.assertEqual(second_state,
                         sched.local_layout_state.get('tenant-one'))
        self.assertEqual(first_state,
                         web.local_layout_state.get('tenant-one'))

        # Make sure we can still access tenant-one's config via
        # zuul-web
        data = self.get_url('api/tenant/tenant-one/jobs').json()
        self.assertEqual(len(data), 4)
        self.log.debug("Release write lock for tenant-two")
    for _ in iterate_timeout(30, "reconfiguration to finish"):
        if 'RECONFIG' not in lock.contenders():
            break

    self.log.debug("Web update layout 2")
    self.web.web.updateSystemConfig()
    self.web.web.updateLayout()
    self.log.debug("Web update layout done")

    # Depending on tenant order, we may need to run one more time
    self.log.debug("Web update layout 3")
    self.web.web.updateSystemConfig()
    self.assertTrue(self.web.web.updateLayout())
    self.log.debug("Web update layout done")

    self.assertEqual(second_state,
                     sched.local_layout_state.get('tenant-one'))
    self.assertEqual(second_state,
                     web.local_layout_state.get('tenant-one'))
    data = self.get_url('api/tenant/tenant-one/jobs').json()
    self.assertEqual(len(data), 4)
|
||||
|
||||
def test_web_labels_allowed_list(self):
|
||||
labels = ["tenant-one-label", "fake", "tenant-two-label"]
|
||||
self.fake_nodepool.registerLauncher(labels, "FakeLauncher2")
|
||||
|
|
|
@ -25,6 +25,7 @@ import voluptuous as vs
|
|||
|
||||
from zuul import change_matcher
|
||||
from zuul import model
|
||||
from zuul.connection import ReadOnlyBranchCacheError
|
||||
from zuul.lib import yamlutil as yaml
|
||||
import zuul.manager.dependent
|
||||
import zuul.manager.independent
|
||||
|
@ -233,6 +234,8 @@ def project_configuration_exceptions(context, accumulator):
|
|||
yield
|
||||
except ConfigurationSyntaxError:
|
||||
raise
|
||||
except ReadOnlyBranchCacheError:
|
||||
raise
|
||||
except Exception as e:
|
||||
intro = textwrap.fill(textwrap.dedent("""\
|
||||
Zuul encountered an error while accessing the repo {repo}. The error
|
||||
|
|
|
@ -21,6 +21,10 @@ from zuul.lib.logutil import get_annotated_logger
|
|||
from zuul.model import Project
|
||||
|
||||
|
||||
class ReadOnlyBranchCacheError(RuntimeError):
    """Raised when branch data is needed but the connection is read-only.

    A read-only consumer (e.g. zuul-web) cannot populate the branch
    cache itself; if a scheduler has not yet fetched a project's
    branches, this error is raised so the caller can retry the layout
    update later, once a scheduler has filled in the cache.
    """
|
||||
|
||||
|
||||
class BaseConnection(object, metaclass=abc.ABCMeta):
|
||||
"""Base class for connections.
|
||||
|
||||
|
@ -241,15 +245,35 @@ class ZKBranchCacheMixin:
|
|||
# Handle the case where tenant validation doesn't use the cache
|
||||
branches = None
|
||||
|
||||
if branches is not None:
|
||||
if branches:
|
||||
return sorted(branches)
|
||||
|
||||
if self.read_only:
|
||||
if branches is None:
|
||||
# A scheduler hasn't attempted to fetch them yet
|
||||
raise ReadOnlyBranchCacheError(
|
||||
"Will not fetch project branches as read-only is set")
|
||||
# A scheduler has previously attempted a fetch, but got
|
||||
# the empty list due to an error; we can't retry since
|
||||
# we're read-only
|
||||
raise RuntimeError(
|
||||
"Will not fetch project branches as read-only is set")
|
||||
|
||||
# We need to perform a query
|
||||
branches = self._fetchProjectBranches(project, exclude_unprotected)
|
||||
try:
|
||||
branches = self._fetchProjectBranches(project, exclude_unprotected)
|
||||
except Exception:
|
||||
# We weren't able to get the branches. We need to tell
|
||||
# future schedulers to try again but tell zuul-web that we
|
||||
# tried and failed. Set the branches to the empty list to
|
||||
# indicate that we have performed a fetch and retrieved no
|
||||
# data. Any time we encounter the empty list in the
|
||||
# cache, we will try again (since it is not reasonable to
|
||||
# have a project with no branches).
|
||||
if self._branch_cache:
|
||||
self._branch_cache.setProjectBranches(
|
||||
project.name, exclude_unprotected, [])
|
||||
raise
|
||||
self.log.info("Got branches for %s" % project.name)
|
||||
|
||||
if self._branch_cache:
|
||||
|
@ -315,14 +339,14 @@ class ZKBranchCacheMixin:
|
|||
# again.
|
||||
event.branch_protected = True
|
||||
|
||||
def clearBranchCache(self, projects=None):
    """Clear the branch cache

    In case the branch cache gets out of sync with the source,
    this method can be called to clear it and force querying the
    source the next time the cache is used.

    :param list projects: If supplied, only the cache entries for
        the named projects are cleared; otherwise the entire cache
        is cleared.
    """
    self._branch_cache.clear(projects)
|
||||
|
||||
|
||||
class ZKChangeCacheMixin:
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import socket
|
||||
import sys
|
||||
|
@ -1365,10 +1366,27 @@ class Scheduler(threading.Thread):
|
|||
# Consider caches valid if the cache ltime >= event ltime
|
||||
min_ltimes = defaultdict(
|
||||
lambda: defaultdict(lambda: event.zuul_event_ltime))
|
||||
# Invalidate the branch cache for all connections
|
||||
# Invalidate the branch cache
|
||||
for connection in self.connections.connections.values():
|
||||
if hasattr(connection, 'clearBranchCache'):
|
||||
connection.clearBranchCache()
|
||||
if event.tenants:
|
||||
# Only clear the projects used by this
|
||||
# tenant (zuul-web won't be able to load
|
||||
# any tenants that we don't immediately
|
||||
# reconfigure after clearing)
|
||||
for tenant_name in event.tenants:
|
||||
projects = [
|
||||
tpc.project.name for tpc in
|
||||
itertools.chain(
|
||||
self.abide.getConfigTPCs(tenant_name),
|
||||
self.abide.getUntrustedTPCs(
|
||||
tenant_name))
|
||||
]
|
||||
connection.clearBranchCache(projects)
|
||||
else:
|
||||
# Clear all projects since we're reloading
|
||||
# all tenants.
|
||||
connection.clearBranchCache()
|
||||
ltime = self.zk_client.getCurrentLtime()
|
||||
# Consider the branch cache valid only after we
|
||||
# cleared it
|
||||
|
|
|
@ -36,7 +36,7 @@ import prometheus_client
|
|||
import zuul.executor.common
|
||||
from zuul import exceptions
|
||||
from zuul.configloader import ConfigLoader
|
||||
from zuul.connection import BaseConnection
|
||||
from zuul.connection import BaseConnection, ReadOnlyBranchCacheError
|
||||
import zuul.lib.repl
|
||||
from zuul.lib import commandsocket, encryption, streamer_utils
|
||||
from zuul.lib.ansible import AnsibleManager
|
||||
|
@ -2048,7 +2048,11 @@ class ZuulWeb(object):
|
|||
if not self._system_config_running:
|
||||
return
|
||||
self.updateSystemConfig()
|
||||
self.updateLayout()
|
||||
if not self.updateLayout():
|
||||
# Branch cache errors with at least one tenant,
|
||||
# try again.
|
||||
time.sleep(10)
|
||||
self.system_config_cache_wake_event.set()
|
||||
except Exception:
|
||||
self.log.exception("Exception while updating system config")
|
||||
|
||||
|
@ -2084,6 +2088,7 @@ class ZuulWeb(object):
|
|||
tenant_names = set(self.abide.tenants)
|
||||
tenant_names.update(self.unparsed_abide.tenants.keys())
|
||||
|
||||
success = True
|
||||
for tenant_name in tenant_names:
|
||||
# Reload the tenant if the layout changed.
|
||||
if (self.local_layout_state.get(tenant_name)
|
||||
|
@ -2091,31 +2096,48 @@ class ZuulWeb(object):
|
|||
continue
|
||||
self.log.debug("Reloading tenant %s", tenant_name)
|
||||
with tenant_read_lock(self.zk_client, tenant_name):
|
||||
layout_state = self.tenant_layout_state.get(tenant_name)
|
||||
layout_uuid = layout_state and layout_state.uuid
|
||||
|
||||
if layout_state:
|
||||
min_ltimes = self.tenant_layout_state.getMinLtimes(
|
||||
layout_state)
|
||||
branch_cache_min_ltimes = (
|
||||
layout_state.branch_cache_min_ltimes)
|
||||
else:
|
||||
# Consider all project branch caches valid if
|
||||
# we don't have a layout state.
|
||||
min_ltimes = defaultdict(
|
||||
lambda: defaultdict(lambda: -1))
|
||||
branch_cache_min_ltimes = defaultdict(lambda: -1)
|
||||
|
||||
# The tenant will be stored in self.abide.tenants after
|
||||
# it was loaded.
|
||||
tenant = loader.loadTenant(
|
||||
self.abide, tenant_name, self.ansible_manager,
|
||||
self.unparsed_abide, min_ltimes=min_ltimes,
|
||||
layout_uuid=layout_uuid,
|
||||
branch_cache_min_ltimes=branch_cache_min_ltimes)
|
||||
if tenant is not None:
|
||||
self.local_layout_state[tenant_name] = layout_state
|
||||
else:
|
||||
with suppress(KeyError):
|
||||
del self.local_layout_state[tenant_name]
|
||||
try:
|
||||
self._updateTenantLayout(loader, tenant_name)
|
||||
except ReadOnlyBranchCacheError:
|
||||
self.log.info(
|
||||
"Unable to update layout due to incomplete branch "
|
||||
"cache, possibly due to in-progress tenant "
|
||||
"reconfiguration; will retry")
|
||||
success = False
|
||||
self.log.debug("Done updating layout state")
|
||||
return success
|
||||
|
||||
def _updateTenantLayout(self, loader, tenant_name):
    """Reload a single tenant's layout if it has changed.

    :param loader: The configloader used to load the tenant.
    :param str tenant_name: The tenant to (possibly) reload.

    May propagate ReadOnlyBranchCacheError from loadTenant if the
    branch cache is missing data that this read-only consumer
    cannot fetch; the caller retries the update later.
    """
    # Reload the tenant if the layout changed.
    if (self.local_layout_state.get(tenant_name)
            == self.tenant_layout_state.get(tenant_name)):
        return
    self.log.debug("Reloading tenant %s", tenant_name)
    with tenant_read_lock(self.zk_client, tenant_name):
        layout_state = self.tenant_layout_state.get(tenant_name)
        layout_uuid = layout_state and layout_state.uuid

        if layout_state:
            min_ltimes = self.tenant_layout_state.getMinLtimes(
                layout_state)
            branch_cache_min_ltimes = (
                layout_state.branch_cache_min_ltimes)
        else:
            # Consider all project branch caches valid if
            # we don't have a layout state.
            min_ltimes = defaultdict(
                lambda: defaultdict(lambda: -1))
            branch_cache_min_ltimes = defaultdict(lambda: -1)

        # The tenant will be stored in self.abide.tenants after
        # it was loaded.
        tenant = loader.loadTenant(
            self.abide, tenant_name, self.ansible_manager,
            self.unparsed_abide, min_ltimes=min_ltimes,
            layout_uuid=layout_uuid,
            branch_cache_min_ltimes=branch_cache_min_ltimes)
        if tenant is not None:
            self.local_layout_state[tenant_name] = layout_state
        else:
            with suppress(KeyError):
                del self.local_layout_state[tenant_name]
|
||||
|
|
|
@ -104,12 +104,17 @@ class BranchCache:
|
|||
self.cache = BranchCacheZKObject.new(
|
||||
self.zk_context, _path=data_path)
|
||||
|
||||
def clear(self, projects=None):
    """Clear the cache

    :param list projects: If supplied, remove only the entries
        (protected and unprotected branch lists) for the named
        projects; otherwise clear the entire cache.
    """
    with locked(self.wlock):
        with self.cache.activeContext(self.zk_context):
            if projects is None:
                self.cache.protected.clear()
                self.cache.remainder.clear()
            else:
                for p in projects:
                    # pop with default: a project may be absent
                    # from one or both maps.
                    self.cache.protected.pop(p, None)
                    self.cache.remainder.pop(p, None)
|
||||
|
||||
def getProjectBranches(self, project_name, exclude_unprotected,
|
||||
min_ltime=-1):
|
||||
|
|
Loading…
Reference in New Issue