Add support for smart reconfigurations

Currently we can only modify the tenant configuration by triggering a
full reconfiguration. However, with many large tenants this can take a
long time to finish. Zuul is stalled during this process. Especially
when the system is at quota, this can lead to long job queues that
build up just after the reconfiguration. This adds support for a smart
reconfiguration that only reconfigures tenants that changed their
config. This can speed up the reconfiguration a lot in large
multi-tenant systems.

Change-Id: I6240b2850d8961a63c17d799f9bec96705435f19
This commit is contained in:
Tobias Henkel 2019-04-12 19:09:57 +02:00
parent 865c952c83
commit 0336205981
15 changed files with 293 additions and 9 deletions

View File

@ -356,13 +356,25 @@ Operation
To start the scheduler, run ``zuul-scheduler``. To stop it, kill the
PID which was saved in the pidfile specified in the configuration.
Reconfiguration
~~~~~~~~~~~~~~~
Most of Zuul's configuration is automatically updated as changes to
the repositories which contain it are merged. However, Zuul must be
explicitly notified of changes to the tenant config file, since it is
not read from a git repository. To do so, run
``zuul-scheduler full-reconfigure``. The signal based method by sending
not read from a git repository. Zuul supports two kinds of reconfigurations.
The full reconfiguration refetches and reloads the configuration of all
tenants. To do so, run ``zuul-scheduler full-reconfigure``. For example, this
can be used to fix possible configuration inconsistencies after connection
problems with Gerrit/GitHub. The signal-based method of sending
a ``SIGHUP`` signal to the scheduler PID is deprecated.
The smart reconfiguration reloads only the tenants that changed their
configuration in the tenant config file. To do so, run
``zuul-scheduler smart-reconfigure``. In multi-tenant systems this can be much
faster than the full reconfiguration, so it is recommended to use the smart
reconfiguration after changing the tenant configuration file.
Merger
------

View File

@ -0,0 +1,5 @@
---
features:
- |
Zuul now supports triggering a smart reconfiguration by using the command
``zuul-scheduler smart-reconfigure``.

View File

@ -3346,6 +3346,18 @@ class ZuulTestCase(BaseTestCase):
except Exception:
self.log.exception("Reconfiguration failed:")
def smartReconfigure(self, command_socket=False):
    """Trigger a smart reconfiguration of the scheduler under test.

    :arg bool command_socket: when true, send the ``smart-reconfigure``
        command through the scheduler's unix command socket (its path
        is read from the ``scheduler`` section of the config);
        otherwise call the scheduler's ``reconfigure`` method directly
        with ``smart=True``.

    Any exception is logged and swallowed, so the method always
    returns ``None``.
    """
    try:
        if not command_socket:
            self.sched.reconfigure(self.config, smart=True)
            return

        # Talk to the scheduler the same way the command line client does.
        socket_path = self.config.get('scheduler', 'command_socket')
        request = 'smart-reconfigure\n'.encode('utf8')
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as conn:
            conn.connect(socket_path)
            conn.sendall(request)
    except Exception:
        self.log.exception("Reconfiguration failed:")
def configure_connections(self, source_only=False):
# Set up gerrit related fakes
# Set a changes database so multiple FakeGerrit's can report back to

View File

@ -0,0 +1,4 @@
- project:
check:
jobs:
- python27

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,4 @@
- project:
check:
jobs:
- python27

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,4 @@
- project:
check:
jobs:
- python27

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,40 @@
- tenant:
name: tenant-one
max-job-timeout: 1800
allowed-reporters:
- gerrit
allowed-labels:
- tenant-one-.*
- ubuntu-trusty
- fake
source:
gerrit:
config-projects:
- common-config
- tenant-one-config
untrusted-projects:
- org/project1
- tenant:
name: tenant-two
max-nodes-per-job: 10
allowed-triggers: gerrit
source:
gerrit:
config-projects:
- common-config
- tenant-two-config
untrusted-projects:
- org/project2
- org/project2b
- tenant:
name: tenant-four
max-nodes-per-job: 10
allowed-triggers: gerrit
source:
gerrit:
config-projects:
- common-config
untrusted-projects:
- org/project4

View File

@ -26,3 +26,14 @@
- tenant-two-config
untrusted-projects:
- org/project2
- tenant:
name: tenant-three
max-nodes-per-job: 10
allowed-triggers: gerrit
source:
gerrit:
config-projects:
- common-config
untrusted-projects:
- org/project3

View File

@ -8025,3 +8025,96 @@ class TestReportBuildPage(ZuulTestCase):
dict(name='python27', result='SUCCESS', changes='1,1'),
])
self.assertIn('python27 finger://', A.messages[0])
class TestSchedulerSmartReconfiguration(ZuulTestCase):
    """Smart reconfiguration tests against the multi-tenant fixture."""

    tenant_config_file = 'config/multi-tenant/main.yaml'

    def _test_smart_reconfiguration(self, command_socket=False):
        """
        Tests that smart reconfiguration works

        The test starts with the tenants tenant-one, tenant-two and
        tenant-three. The tenant config is then swapped so that a smart
        reconfiguration has to deal with these changes:

        - tenant-one remains unchanged
        - tenant-two gets another repo
        - tenant-three gets removed completely
        - tenant-four is a new tenant

        :arg bool command_socket: passed through to
            ``smartReconfigure`` to choose how the reconfiguration is
            triggered.
        """
        self.executor_server.hold_jobs_in_build = True

        # Create one change per initially configured tenant
        changes = {}
        for subject, project in (('A', 'org/project1'),
                                 ('B', 'org/project2'),
                                 ('C', 'org/project3')):
            change = self.fake_gerrit.addFakeChange(
                project, 'master', subject)
            self.fake_gerrit.addEvent(change.getPatchsetCreatedEvent(1))
            changes[subject] = change

        # Remember the last reconfiguration times; they may not be set
        # yet, in which case 0 is used.
        one_before = self.sched.tenant_last_reconfigured.get(
            'tenant-one', 0)
        two_before = self.sched.tenant_last_reconfigured.get(
            'tenant-two', 0)
        self.waitUntilSettled()

        self.newTenantConfig('config/multi-tenant/main-reconfig.yaml')
        self.smartReconfigure(command_socket=command_socket)

        # Poll until tenant-two has been reconfigured. waitUntilSettled
        # cannot be relied on here because the reconfigure event may
        # only reach the event queue after waitUntilSettled returned.
        began = time.time()
        while True:
            if time.time() - began > 15:
                raise Exception("Timeout waiting for smart reconfiguration")
            two_after = self.sched.tenant_last_reconfigured.get(
                'tenant-two', 0)
            if two_after > two_before:
                break
            time.sleep(0.1)

        # tenant-one is unchanged, so its timestamp must not have moved
        self.waitUntilSettled()
        one_after = self.sched.tenant_last_reconfigured.get(
            'tenant-one', 0)
        self.assertEqual(one_before, one_after)

        self.executor_server.hold_jobs_in_build = False
        self.executor_server.release()
        self.waitUntilSettled()

        # Changes in tenant-one and tenant-two have to be reported
        self.assertEqual(1, changes['A'].reported)
        self.assertEqual(1, changes['B'].reported)

        # The tenant-three has been removed so nothing should be reported
        self.assertEqual(0, changes['C'].reported)

        # Verify known tenants
        expected_tenants = {'tenant-one', 'tenant-two', 'tenant-four'}
        self.assertEqual(expected_tenants, self.sched.abide.tenants.keys())
        self.assertIsNotNone(
            self.sched.tenant_last_reconfigured.get('tenant-four'),
            'Tenant tenant-four should exist now.')

        # The newly added tenant-four must process changes
        change_d = self.fake_gerrit.addFakeChange(
            'org/project4', 'master', 'D')
        self.fake_gerrit.addEvent(change_d.getPatchsetCreatedEvent(1))
        self.waitUntilSettled()
        self.assertEqual(1, change_d.reported)

        # The project added to tenant-two must process changes
        change_b2 = self.fake_gerrit.addFakeChange(
            'org/project2b', 'master', 'B2')
        self.fake_gerrit.addEvent(change_b2.getPatchsetCreatedEvent(1))
        self.waitUntilSettled()
        self.assertEqual(1, change_b2.reported)

    def test_smart_reconfiguration(self):
        "Test that smart reconfiguration works"
        self._test_smart_reconfiguration()

    def test_smart_reconfiguration_command_socket(self):
        "Test that smart reconfiguration works using command socket"
        self._test_smart_reconfiguration(command_socket=True)

View File

@ -59,6 +59,15 @@ class Scheduler(zuul.cmd.ZuulDaemonApp):
except Exception:
self.log.exception("Reconfiguration failed:")
def smartReconfigure(self):
    """Re-read the config and submit a smart reconfiguration.

    The configuration is read again and logging is set up again before
    the scheduler is asked to reconfigure with ``smart=True``. A
    failure of that request is logged instead of being propagated.
    """
    self.log.debug("Smart reconfiguration triggered")

    # Refresh configuration and logging before handing over.
    self.readConfig()
    self.setup_logging('scheduler', 'log_config')

    try:
        scheduler = self.sched
        scheduler.reconfigure(self.config, smart=True)
    except Exception:
        self.log.exception("Reconfiguration failed:")
def reconfigure_handler(self, signum, frame):
signal.signal(signal.SIGHUP, signal.SIG_IGN)
self.fullReconfigure()

View File

@ -2173,17 +2173,30 @@ class ConfigLoader(object):
self.log.warning(err.error)
return abide
def reloadTenant(self, abide, tenant, ansible_manager):
def reloadTenant(self, abide, tenant, ansible_manager,
unparsed_abide=None):
new_abide = model.Abide()
new_abide.tenants = abide.tenants.copy()
new_abide.admin_rules = abide.admin_rules.copy()
new_abide.unparsed_project_branch_cache = \
abide.unparsed_project_branch_cache
if unparsed_abide:
# We got a new unparsed abide so re-load the tenant completely.
# First check if the tenant is still existing and if not remove
# from the abide.
if tenant.name not in unparsed_abide.known_tenants:
del new_abide.tenants[tenant.name]
return new_abide
unparsed_config = next(t for t in unparsed_abide.tenants
if t['name'] == tenant.name)
else:
unparsed_config = tenant.unparsed_config
# When reloading a tenant only, use cached data if available.
new_tenant = self.tenant_parser.fromYaml(
new_abide,
tenant.unparsed_config, ansible_manager)
new_abide, unparsed_config, ansible_manager)
new_abide.tenants[tenant.name] = new_tenant
if len(new_tenant.layout.loading_errors):
self.log.warning(

View File

@ -40,9 +40,9 @@ from zuul.lib.logutil import get_annotated_logger
from zuul.lib.statsd import get_statsd
import zuul.lib.queue
import zuul.lib.repl
from zuul.model import Build, HoldRequest
from zuul.model import Build, HoldRequest, Tenant
COMMANDS = ['full-reconfigure', 'stop', 'repl', 'norepl']
COMMANDS = ['full-reconfigure', 'smart-reconfigure', 'stop', 'repl', 'norepl']
class ManagementEvent(object):
@ -79,6 +79,17 @@ class ReconfigureEvent(ManagementEvent):
self.config = config
class SmartReconfigureEvent(ManagementEvent):
    """Request a smart reconfiguration of the scheduler.

    The event only carries the configuration; the scheduler decides
    how to act on it when it handles the event.

    :arg ConfigParser config: the new configuration
    :arg bool smart: accepted for call compatibility; this class does
        not read or store it
    """

    def __init__(self, config, smart=False):
        super().__init__()
        self.config = config
class TenantReconfigureEvent(ManagementEvent):
"""Reconfigure the given tenant. The layout will be (re-)loaded from
the path specified in the configuration.
@ -282,6 +293,7 @@ class Scheduler(threading.Thread):
self.command_map = {
'stop': self.stop,
'full-reconfigure': self.fullReconfigureCommandHandler,
'smart-reconfigure': self.smartReconfigureCommandHandler,
'repl': self.start_repl,
'norepl': self.stop_repl,
}
@ -534,6 +546,9 @@ class Scheduler(threading.Thread):
def fullReconfigureCommandHandler(self):
    """Delegate a full reconfiguration request to the Zuul app."""
    app = self._zuul_app
    app.fullReconfigure()
def smartReconfigureCommandHandler(self):
    """Delegate a smart reconfiguration request to the Zuul app."""
    app = self._zuul_app
    app.smartReconfigure()
def start_repl(self):
if self.repl:
return
@ -546,9 +561,12 @@ class Scheduler(threading.Thread):
self.repl.stop()
self.repl = None
def reconfigure(self, config):
def reconfigure(self, config, smart=False):
self.log.debug("Submitting reconfiguration event")
event = ReconfigureEvent(config)
if smart:
event = SmartReconfigureEvent(config)
else:
event = ReconfigureEvent(config)
self.management_event_queue.put(event)
self.wake_event.set()
self.log.debug("Waiting for reconfiguration")
@ -811,6 +829,60 @@ class Scheduler(threading.Thread):
self.log.info("Full reconfiguration complete (duration: %s seconds)",
duration)
def _doSmartReconfigureEvent(self, event):
    """Reload only the tenants whose tenant config entry changed.

    The tenant configuration is read again and compared, tenant by
    tenant, with the previously read one. Tenants with an identical
    entry are skipped; every other tenant (changed, added or removed)
    is passed to ``reloadTenant`` and the resulting abide replaces
    ``self.abide``.

    :arg SmartReconfigureEvent event: the event carrying the new
        configuration in ``event.config``
    """
    # Runs in the scheduler loop once another thread has submitted
    # the request.
    reconfigured_tenants = []
    with self.layout_lock:
        self.config = event.config
        self.log.info("Smart reconfiguration beginning")
        start = time.monotonic()

        # The default ansible version may have changed, so the ansible
        # manager is recreated.
        default_ansible_version = get_default(
            self.config, 'scheduler', 'default_ansible_version', None)
        self.ansible_manager = AnsibleManager(
            default_version=default_ansible_version)

        loader = configloader.ConfigLoader(
            self.connections, self, self.merger, self._get_key_dir())
        tenant_config, script = self._checkTenantSourceConf(self.config)
        previous_unparsed_abide = self.unparsed_abide
        self.unparsed_abide = loader.readConfig(
            tenant_config, from_script=script)

        # New and deleted tenants both need handling, so look at the
        # union of the currently loaded and the newly read tenants.
        candidate_names = {name for name in self.abide.tenants}
        candidate_names.update(self.unparsed_abide.known_tenants)

        for tenant_name in candidate_names:
            config_before = [entry
                             for entry in previous_unparsed_abide.tenants
                             if entry['name'] == tenant_name]
            config_after = [entry
                            for entry in self.unparsed_abide.tenants
                            if entry['name'] == tenant_name]
            if config_before == config_after:
                # Nothing changed for this tenant.
                continue

            reconfigured_tenants.append(tenant_name)

            loaded_tenant = self.abide.tenants.get(tenant_name)
            if loaded_tenant is None:
                # A tenant that is not loaded yet is represented by a
                # placeholder that only carries the correct name.
                loaded_tenant = Tenant(tenant_name)

            abide = loader.reloadTenant(
                self.abide, loaded_tenant, self.ansible_manager,
                self.unparsed_abide)

            # A removed tenant is absent from the new abide, so there
            # is nothing to reconfigure for it.
            reloaded_tenant = abide.tenants.get(tenant_name)
            if reloaded_tenant is not None:
                self._reconfigureTenant(reloaded_tenant)
            self.abide = abide

    duration = round(time.monotonic() - start, 3)
    self.log.info("Smart reconfiguration of tenants %s complete "
                  "(duration: %s seconds)", reconfigured_tenants, duration)
def _doTenantReconfigureEvent(self, event):
# This is called in the scheduler loop after another thread submits
# a request
@ -1242,6 +1314,8 @@ class Scheduler(threading.Thread):
try:
if isinstance(event, ReconfigureEvent):
self._doReconfigureEvent(event)
if isinstance(event, SmartReconfigureEvent):
self._doSmartReconfigureEvent(event)
elif isinstance(event, TenantReconfigureEvent):
self._doTenantReconfigureEvent(event)
elif isinstance(event, PromoteEvent):