Merge "Fix zuul-web layout update on full/tenant-reconfigure"
@@ -4419,9 +4419,16 @@ class SchedulerTestApp:
         else:
             self.sched.validateTenants(self.config, validate_tenants)
 
-    def fullReconfigure(self):
+    def fullReconfigure(self, command_socket=False):
         try:
-            self.sched.reconfigure(self.config)
+            if command_socket:
+                command_socket = self.sched.config.get(
+                    'scheduler', 'command_socket')
+                with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
+                    s.connect(command_socket)
+                    s.sendall('full-reconfigure\n'.encode('utf8'))
+            else:
+                self.sched.reconfigure(self.config)
         except Exception:
             self.log.exception("Reconfiguration failed:")
 
@@ -4438,9 +4445,19 @@ class SchedulerTestApp:
         except Exception:
             self.log.exception("Reconfiguration failed:")
 
-    def tenantReconfigure(self, tenants):
+    def tenantReconfigure(self, tenants, command_socket=False):
         try:
-            self.sched.reconfigure(self.config, smart=False, tenants=tenants)
+            if command_socket:
+                command_socket = self.sched.config.get(
+                    'scheduler', 'command_socket')
+                args = json.dumps(tenants)
+                with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
+                    s.connect(command_socket)
+                    s.sendall(f'tenant-reconfigure {args}\n'.
+                              encode('utf8'))
+            else:
+                self.sched.reconfigure(
+                    self.config, smart=False, tenants=tenants)
         except Exception:
             self.log.exception("Reconfiguration failed:")
 
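
Note: the helpers above talk to the scheduler over its UNIX command socket. Per the diff, the wire format is one newline-terminated UTF-8 command per connection, with tenant-reconfigure additionally carrying a JSON-encoded tenant list. A minimal client sketch; the helper name and socket path are illustrative, not part of Zuul:

    import json
    import socket

    def send_scheduler_command(socket_path, command, tenants=None):
        # Illustrative helper: one newline-terminated UTF-8 command per
        # connection; tenant-reconfigure carries a JSON tenant list.
        payload = command
        if tenants is not None:
            payload += ' ' + json.dumps(tenants)
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
            s.connect(socket_path)
            s.sendall((payload + '\n').encode('utf8'))

    # send_scheduler_command('/var/lib/zuul/scheduler.socket',
    #                        'full-reconfigure')
    # send_scheduler_command('/var/lib/zuul/scheduler.socket',
    #                        'tenant-reconfigure', ['tenant-two'])
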
@@ -28,6 +28,7 @@ from unittest import skip
 import requests
 
 from zuul.lib.statsd import normalize_statsd_name
+from zuul.zk.locks import tenant_write_lock
 import zuul.web
 
 from tests.base import ZuulTestCase, AnsibleZuulTestCase
@@ -1235,6 +1236,94 @@ class TestWebStatusDisplayBranch(BaseTestWeb):
 class TestWebMultiTenant(BaseTestWeb):
     tenant_config_file = 'config/multi-tenant/main.yaml'
 
+    def test_tenant_reconfigure_command(self):
+        # The 'zuul-scheduler tenant-reconfigure' and full-reconfigure
+        # are used to correct problems, and as such they clear the
+        # branch cache.  Until the reconfiguration is complete,
+        # zuul-web will be unable to load configuration for any tenant
+        # which has projects that have been cleared from the branch
+        # cache.  This test verifies that we retry that operation
+        # after encountering missing branch errors.
+        sched = self.scheds.first.sched
+        web = self.web.web
+        # Don't perform any automatic config updates on zuul web so
+        # that we can control the sequencing.
+        self.web.web._system_config_running = False
+        self.web.web.system_config_cache_wake_event.set()
+        self.web.web.system_config_thread.join()
+
+        first_state = sched.tenant_layout_state.get('tenant-one')
+        self.assertEqual(first_state,
+                         web.local_layout_state.get('tenant-one'))
+
+        data = self.get_url('api/tenant/tenant-one/jobs').json()
+        self.assertEqual(len(data), 4)
+
+        # Reconfigure tenant-one so that the layout state will be
+        # different and we can start a layout update in zuul-web
+        # later.
+        self.log.debug("Reconfigure tenant-one")
+        self.scheds.first.tenantReconfigure(['tenant-one'])
+        self.waitUntilSettled()
+        self.log.debug("Done reconfigure tenant-one")
+
+        second_state = sched.tenant_layout_state.get('tenant-one')
+        self.assertEqual(second_state,
+                         sched.local_layout_state.get('tenant-one'))
+        self.assertEqual(first_state,
+                         web.local_layout_state.get('tenant-one'))
+
+        self.log.debug("Grab write lock for tenant-two")
+        with tenant_write_lock(self.zk_client, 'tenant-two') as lock:
+            # Start a reconfiguration of tenant-two; allow it to
+            # proceed past the point that the branch cache is cleared
+            # and is waiting on the lock we hold.
+            self.scheds.first.tenantReconfigure(
+                ['tenant-two'], command_socket=True)
+            for _ in iterate_timeout(30, "reconfiguration to start"):
+                if 'RECONFIG' in lock.contenders():
+                    break
+            # Now that the branch cache is cleared as part of the
+            # tenant-two reconfiguration, allow zuul-web to
+            # reconfigure tenant-one.  This should produce an error
+            # because of the missing branch cache.
+            self.log.debug("Web update layout 1")
+            self.web.web.updateSystemConfig()
+            self.assertFalse(self.web.web.updateLayout())
+            self.log.debug("Web update layout done")
+
+            self.assertEqual(second_state,
+                             sched.local_layout_state.get('tenant-one'))
+            self.assertEqual(first_state,
+                             web.local_layout_state.get('tenant-one'))
+
+            # Make sure we can still access tenant-one's config via
+            # zuul-web
+            data = self.get_url('api/tenant/tenant-one/jobs').json()
+            self.assertEqual(len(data), 4)
+        self.log.debug("Release write lock for tenant-two")
+        for _ in iterate_timeout(30, "reconfiguration to finish"):
+            if 'RECONFIG' not in lock.contenders():
+                break
+
+        self.log.debug("Web update layout 2")
+        self.web.web.updateSystemConfig()
+        self.web.web.updateLayout()
+        self.log.debug("Web update layout done")
+
+        # Depending on tenant order, we may need to run one more time
+        self.log.debug("Web update layout 3")
+        self.web.web.updateSystemConfig()
+        self.assertTrue(self.web.web.updateLayout())
+        self.log.debug("Web update layout done")
+
+        self.assertEqual(second_state,
+                         sched.local_layout_state.get('tenant-one'))
+        self.assertEqual(second_state,
+                         web.local_layout_state.get('tenant-one'))
+        data = self.get_url('api/tenant/tenant-one/jobs').json()
+        self.assertEqual(len(data), 4)
+
     def test_web_labels_allowed_list(self):
         labels = ["tenant-one-label", "fake", "tenant-two-label"]
         self.fake_nodepool.registerLauncher(labels, "FakeLauncher2")
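
The test polls with iterate_timeout from tests.base. As a rough sketch of its contract, assuming the real helper differs only in detail: it yields repeatedly until the caller breaks out of the loop, and raises once the deadline passes.

    import time

    def iterate_timeout(max_seconds, purpose):
        # Sketch only: yield until the caller breaks; raise if the
        # deadline passes first.
        start = time.time()
        count = 0
        while time.time() < start + max_seconds:
            count += 1
            yield count
            time.sleep(0.01)
        raise Exception("Timeout waiting for %s" % purpose)
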
@@ -25,6 +25,7 @@ import voluptuous as vs
 
 from zuul import change_matcher
 from zuul import model
+from zuul.connection import ReadOnlyBranchCacheError
 from zuul.lib import yamlutil as yaml
 import zuul.manager.dependent
 import zuul.manager.independent
@@ -233,6 +234,8 @@ def project_configuration_exceptions(context, accumulator):
         yield
     except ConfigurationSyntaxError:
         raise
+    except ReadOnlyBranchCacheError:
+        raise
     except Exception as e:
         intro = textwrap.fill(textwrap.dedent("""\
             Zuul encountered an error while accessing the repo {repo}. The error
@@ -21,6 +21,10 @@ from zuul.lib.logutil import get_annotated_logger
 from zuul.model import Project
 
 
+class ReadOnlyBranchCacheError(RuntimeError):
+    pass
+
+
 class BaseConnection(object, metaclass=abc.ABCMeta):
     """Base class for connections.
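
Making the new error a RuntimeError subclass means existing broad handlers still catch it, while new code can special-case the retryable condition. A small illustration; the classify function is hypothetical:

    from zuul.connection import ReadOnlyBranchCacheError

    def classify(exc):
        # Hypothetical handler: test the retryable case first, since it
        # is also a RuntimeError.
        if isinstance(exc, ReadOnlyBranchCacheError):
            return 'retry'   # a scheduler may still populate the cache
        if isinstance(exc, RuntimeError):
            return 'fail'    # a scheduler already tried and failed
        return 'unexpected'

    assert classify(ReadOnlyBranchCacheError()) == 'retry'
    assert classify(RuntimeError()) == 'fail'
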
@@ -241,15 +245,35 @@ class ZKBranchCacheMixin:
             # Handle the case where tenant validation doesn't use the cache
             branches = None
 
-        if branches is not None:
+        if branches:
             return sorted(branches)
 
         if self.read_only:
+            if branches is None:
+                # A scheduler hasn't attempted to fetch them yet
+                raise ReadOnlyBranchCacheError(
+                    "Will not fetch project branches as read-only is set")
+            # A scheduler has previously attempted a fetch, but got
+            # the empty list due to an error; we can't retry since
+            # we're read-only
             raise RuntimeError(
                 "Will not fetch project branches as read-only is set")
 
         # We need to perform a query
-        branches = self._fetchProjectBranches(project, exclude_unprotected)
+        try:
+            branches = self._fetchProjectBranches(project, exclude_unprotected)
+        except Exception:
+            # We weren't able to get the branches.  We need to tell
+            # future schedulers to try again but tell zuul-web that we
+            # tried and failed.  Set the branches to the empty list to
+            # indicate that we have performed a fetch and retrieved no
+            # data.  Any time we encounter the empty list in the
+            # cache, we will try again (since it is not reasonable to
+            # have a project with no branches).
+            if self._branch_cache:
+                self._branch_cache.setProjectBranches(
+                    project.name, exclude_unprotected, [])
+            raise
         self.log.info("Got branches for %s" % project.name)
 
         if self._branch_cache:
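
The cache now encodes three states per project: None (no scheduler has fetched yet), the empty list (a scheduler fetched and failed), and a non-empty list (valid data). A condensed sketch of the read-side decision implemented above; the function and parameter names are illustrative:

    from zuul.connection import ReadOnlyBranchCacheError

    def resolve_branches(cached, read_only):
        if cached:                  # non-empty list: valid data
            return sorted(cached)
        if read_only:               # zuul-web cannot fetch for itself
            if cached is None:      # nobody has tried yet: retryable
                raise ReadOnlyBranchCacheError("wait for a scheduler")
            raise RuntimeError("a scheduler tried and failed")  # [] sentinel
        return None                 # writable: caller performs the fetch
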
@@ -315,14 +339,14 @@ class ZKBranchCacheMixin:
         # again.
         event.branch_protected = True
 
-    def clearBranchCache(self):
+    def clearBranchCache(self, projects=None):
         """Clear the branch cache
 
         In case the branch cache gets out of sync with the source,
         this method can be called to clear it and force querying the
         source the next time the cache is used.
         """
-        self._branch_cache.clear()
+        self._branch_cache.clear(projects)
 
 
 class ZKChangeCacheMixin:
@@ -16,6 +16,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import itertools
 import logging
 import socket
 import sys
@@ -1365,10 +1366,27 @@ class Scheduler(threading.Thread):
             # Consider caches valid if the cache ltime >= event ltime
             min_ltimes = defaultdict(
                 lambda: defaultdict(lambda: event.zuul_event_ltime))
-            # Invalidate the branch cache for all connections
+            # Invalidate the branch cache
             for connection in self.connections.connections.values():
                 if hasattr(connection, 'clearBranchCache'):
-                    connection.clearBranchCache()
+                    if event.tenants:
+                        # Only clear the projects used by this
+                        # tenant (zuul-web won't be able to load
+                        # any tenants that we don't immediately
+                        # reconfigure after clearing)
+                        for tenant_name in event.tenants:
+                            projects = [
+                                tpc.project.name for tpc in
+                                itertools.chain(
+                                    self.abide.getConfigTPCs(tenant_name),
+                                    self.abide.getUntrustedTPCs(
+                                        tenant_name))
+                            ]
+                            connection.clearBranchCache(projects)
+                    else:
+                        # Clear all projects since we're reloading
+                        # all tenants.
+                        connection.clearBranchCache()
             ltime = self.zk_client.getCurrentLtime()
             # Consider the branch cache valid only after we
             # cleared it
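
Condensed, the per-tenant branch of the logic above gathers every project referenced by the tenant's config-project and untrusted-project TPCs and clears only those cache entries. A sketch; 'abide' and 'connection' stand in for the scheduler attributes:

    import itertools

    def clear_tenant_branch_cache(abide, connection, tenant_name):
        # Chain both TPC collections so config and untrusted projects
        # are covered without building an intermediate list.
        projects = [
            tpc.project.name
            for tpc in itertools.chain(abide.getConfigTPCs(tenant_name),
                                       abide.getUntrustedTPCs(tenant_name))
        ]
        connection.clearBranchCache(projects)
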
@@ -36,7 +36,7 @@ import prometheus_client
 import zuul.executor.common
 from zuul import exceptions
 from zuul.configloader import ConfigLoader
-from zuul.connection import BaseConnection
+from zuul.connection import BaseConnection, ReadOnlyBranchCacheError
 import zuul.lib.repl
 from zuul.lib import commandsocket, encryption, streamer_utils
 from zuul.lib.ansible import AnsibleManager
@@ -2048,7 +2048,11 @@ class ZuulWeb(object):
                 if not self._system_config_running:
                     return
                 self.updateSystemConfig()
-                self.updateLayout()
+                if not self.updateLayout():
+                    # Branch cache errors with at least one tenant,
+                    # try again.
+                    time.sleep(10)
+                    self.system_config_cache_wake_event.set()
             except Exception:
                 self.log.exception("Exception while updating system config")
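
The effect of this change is a self-rearming retry: when updateLayout() reports a branch-cache miss, the loop sleeps and sets its own wake event instead of waiting for the next external config change. A stripped-down sketch of the pattern; the loop plumbing is illustrative:

    import threading
    import time

    wake_event = threading.Event()

    def config_update_loop(update_layout, running):
        while running():
            wake_event.wait()
            wake_event.clear()
            if not update_layout():
                # Branch cache incomplete; retry on the next iteration.
                time.sleep(10)
                wake_event.set()
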
@@ -2084,38 +2088,51 @@ class ZuulWeb(object):
         tenant_names = set(self.abide.tenants)
         tenant_names.update(self.unparsed_abide.tenants.keys())
 
+        success = True
         for tenant_name in tenant_names:
-            # Reload the tenant if the layout changed.
-            if (self.local_layout_state.get(tenant_name)
-                    == self.tenant_layout_state.get(tenant_name)):
-                continue
-            self.log.debug("Reloading tenant %s", tenant_name)
-            with tenant_read_lock(self.zk_client, tenant_name):
-                layout_state = self.tenant_layout_state.get(tenant_name)
-                layout_uuid = layout_state and layout_state.uuid
-
-                if layout_state:
-                    min_ltimes = self.tenant_layout_state.getMinLtimes(
-                        layout_state)
-                    branch_cache_min_ltimes = (
-                        layout_state.branch_cache_min_ltimes)
-                else:
-                    # Consider all project branch caches valid if
-                    # we don't have a layout state.
-                    min_ltimes = defaultdict(
-                        lambda: defaultdict(lambda: -1))
-                    branch_cache_min_ltimes = defaultdict(lambda: -1)
-
-                # The tenant will be stored in self.abide.tenants after
-                # it was loaded.
-                tenant = loader.loadTenant(
-                    self.abide, tenant_name, self.ansible_manager,
-                    self.unparsed_abide, min_ltimes=min_ltimes,
-                    layout_uuid=layout_uuid,
-                    branch_cache_min_ltimes=branch_cache_min_ltimes)
-                if tenant is not None:
-                    self.local_layout_state[tenant_name] = layout_state
-                else:
-                    with suppress(KeyError):
-                        del self.local_layout_state[tenant_name]
+            try:
+                self._updateTenantLayout(loader, tenant_name)
+            except ReadOnlyBranchCacheError:
+                self.log.info(
+                    "Unable to update layout due to incomplete branch "
+                    "cache, possibly due to in-progress tenant "
+                    "reconfiguration; will retry")
+                success = False
+        self.log.debug("Done updating layout state")
+        return success
+
+    def _updateTenantLayout(self, loader, tenant_name):
+        # Reload the tenant if the layout changed.
+        if (self.local_layout_state.get(tenant_name)
+                == self.tenant_layout_state.get(tenant_name)):
+            return
+        self.log.debug("Reloading tenant %s", tenant_name)
+        with tenant_read_lock(self.zk_client, tenant_name):
+            layout_state = self.tenant_layout_state.get(tenant_name)
+            layout_uuid = layout_state and layout_state.uuid
+
+            if layout_state:
+                min_ltimes = self.tenant_layout_state.getMinLtimes(
+                    layout_state)
+                branch_cache_min_ltimes = (
+                    layout_state.branch_cache_min_ltimes)
+            else:
+                # Consider all project branch caches valid if
+                # we don't have a layout state.
+                min_ltimes = defaultdict(
+                    lambda: defaultdict(lambda: -1))
+                branch_cache_min_ltimes = defaultdict(lambda: -1)
+
+            # The tenant will be stored in self.abide.tenants after
+            # it was loaded.
+            tenant = loader.loadTenant(
+                self.abide, tenant_name, self.ansible_manager,
+                self.unparsed_abide, min_ltimes=min_ltimes,
+                layout_uuid=layout_uuid,
+                branch_cache_min_ltimes=branch_cache_min_ltimes)
+            if tenant is not None:
+                self.local_layout_state[tenant_name] = layout_state
+            else:
+                with suppress(KeyError):
+                    del self.local_layout_state[tenant_name]
@@ -104,12 +104,17 @@ class BranchCache:
         self.cache = BranchCacheZKObject.new(
             self.zk_context, _path=data_path)
 
-    def clear(self):
+    def clear(self, projects=None):
         """Clear the cache"""
         with locked(self.wlock):
             with self.cache.activeContext(self.zk_context):
-                self.cache.protected.clear()
-                self.cache.remainder.clear()
+                if projects is None:
+                    self.cache.protected.clear()
+                    self.cache.remainder.clear()
+                else:
+                    for p in projects:
+                        self.cache.protected.pop(p, None)
+                        self.cache.remainder.pop(p, None)
 
     def getProjectBranches(self, project_name, exclude_unprotected,
                            min_ltime=-1):
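
With the widened signature, callers can invalidate just the affected projects, while the no-argument form keeps the old clear-everything behavior; dict.pop(key, None) makes it harmless to name a project that was never cached. A minimal usage sketch, assuming 'cache' is a BranchCache instance and the project names are placeholders:

    # Selective: drop only these projects from both maps.
    cache.clear(['org/project1', 'org/project2'])

    # Full: same behavior as before the change.
    cache.clear()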