Add support for smart reconfigurations
Currently we can only modify the tenant configuration by triggering a full reconfiguration. However, with many large tenants this can take a long time to finish, and Zuul is stalled during this process. Especially when the system is at quota, this can lead to long job queues that build up just after the reconfiguration. This adds support for a smart reconfiguration that only reconfigures tenants whose config changed. This can speed up the reconfiguration a lot in large multi-tenant systems. Change-Id: I6240b2850d8961a63c17d799f9bec96705435f19
This commit is contained in:
parent
865c952c83
commit
0336205981
|
@ -356,13 +356,25 @@ Operation
|
|||
To start the scheduler, run ``zuul-scheduler``. To stop it, kill the
|
||||
PID which was saved in the pidfile specified in the configuration.
|
||||
|
||||
Reconfiguration
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
Most of Zuul's configuration is automatically updated as changes to
|
||||
the repositories which contain it are merged. However, Zuul must be
|
||||
explicitly notified of changes to the tenant config file, since it is
|
||||
not read from a git repository. To do so, run
|
||||
``zuul-scheduler full-reconfigure``. The signal based method by sending
|
||||
not read from a git repository. Zuul supports two kinds of reconfigurations.
|
||||
|
||||
The full reconfiguration refetches and reloads the configuration of all
|
||||
tenants. To do so, run `zuul-scheduler full-reconfigure`. For example this
|
||||
can be used to fix possible configuration inconsistencies after connection
|
||||
problems to Gerrit/GitHub. The signal-based method of sending
|
||||
a `SIGHUP` signal to the scheduler PID is deprecated.
|
||||
|
||||
The smart reconfiguration reloads only the tenants that changed their
|
||||
configuration in the tenant config file. To do so, run
|
||||
`zuul-scheduler smart-reconfigure`. In multi-tenant systems this can be much
|
||||
faster than the full reconfiguration so it is recommended to use the smart
|
||||
reconfiguration after changing the tenant configuration file.
|
||||
|
||||
Merger
|
||||
------
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
features:
|
||||
- |
|
||||
Zuul now supports triggering a smart reconfiguration by using the command
|
||||
``zuul-scheduler smart-reconfigure``.
|
|
@ -3346,6 +3346,18 @@ class ZuulTestCase(BaseTestCase):
|
|||
except Exception:
|
||||
self.log.exception("Reconfiguration failed:")
|
||||
|
||||
def smartReconfigure(self, command_socket=False):
|
||||
try:
|
||||
if command_socket:
|
||||
command_socket = self.config.get('scheduler', 'command_socket')
|
||||
with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
|
||||
s.connect(command_socket)
|
||||
s.sendall('smart-reconfigure\n'.encode('utf8'))
|
||||
else:
|
||||
self.sched.reconfigure(self.config, smart=True)
|
||||
except Exception:
|
||||
self.log.exception("Reconfiguration failed:")
|
||||
|
||||
def configure_connections(self, source_only=False):
|
||||
# Set up gerrit related fakes
|
||||
# Set a changes database so multiple FakeGerrit's can report back to
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
- project:
|
||||
check:
|
||||
jobs:
|
||||
- python27
|
|
@ -0,0 +1 @@
|
|||
test
|
|
@ -0,0 +1,4 @@
|
|||
- project:
|
||||
check:
|
||||
jobs:
|
||||
- python27
|
|
@ -0,0 +1 @@
|
|||
test
|
|
@ -0,0 +1,4 @@
|
|||
- project:
|
||||
check:
|
||||
jobs:
|
||||
- python27
|
|
@ -0,0 +1 @@
|
|||
test
|
|
@ -0,0 +1,40 @@
|
|||
- tenant:
|
||||
name: tenant-one
|
||||
max-job-timeout: 1800
|
||||
allowed-reporters:
|
||||
- gerrit
|
||||
allowed-labels:
|
||||
- tenant-one-.*
|
||||
- ubuntu-trusty
|
||||
- fake
|
||||
source:
|
||||
gerrit:
|
||||
config-projects:
|
||||
- common-config
|
||||
- tenant-one-config
|
||||
untrusted-projects:
|
||||
- org/project1
|
||||
|
||||
- tenant:
|
||||
name: tenant-two
|
||||
max-nodes-per-job: 10
|
||||
allowed-triggers: gerrit
|
||||
source:
|
||||
gerrit:
|
||||
config-projects:
|
||||
- common-config
|
||||
- tenant-two-config
|
||||
untrusted-projects:
|
||||
- org/project2
|
||||
- org/project2b
|
||||
|
||||
- tenant:
|
||||
name: tenant-four
|
||||
max-nodes-per-job: 10
|
||||
allowed-triggers: gerrit
|
||||
source:
|
||||
gerrit:
|
||||
config-projects:
|
||||
- common-config
|
||||
untrusted-projects:
|
||||
- org/project4
|
|
@ -26,3 +26,14 @@
|
|||
- tenant-two-config
|
||||
untrusted-projects:
|
||||
- org/project2
|
||||
|
||||
- tenant:
|
||||
name: tenant-three
|
||||
max-nodes-per-job: 10
|
||||
allowed-triggers: gerrit
|
||||
source:
|
||||
gerrit:
|
||||
config-projects:
|
||||
- common-config
|
||||
untrusted-projects:
|
||||
- org/project3
|
||||
|
|
|
@ -8025,3 +8025,96 @@ class TestReportBuildPage(ZuulTestCase):
|
|||
dict(name='python27', result='SUCCESS', changes='1,1'),
|
||||
])
|
||||
self.assertIn('python27 finger://', A.messages[0])
|
||||
|
||||
|
||||
class TestSchedulerSmartReconfiguration(ZuulTestCase):
    # Tenant layout used as the starting point for the reconfiguration
    # scenario; the reconfigured layout is in main-reconfig.yaml.
    tenant_config_file = 'config/multi-tenant/main.yaml'

    def _test_smart_reconfiguration(self, command_socket=False):
        """
        Tests that smart reconfiguration works

        In this scenario we have the tenants tenant-one, tenant-two and
        tenant-three. We make the following changes and then trigger a smart
        reconfiguration:
        - tenant-one remains unchanged
        - tenant-two gets another repo
        - tenant-three gets removed completely
        - tenant-four is a new tenant

        :arg bool command_socket: when True, trigger the reconfiguration
            via the scheduler's command socket instead of the API.
        """
        self.executor_server.hold_jobs_in_build = True

        # Create changes for all tenants
        A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A')
        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
        B = self.fake_gerrit.addFakeChange('org/project2', 'master', 'B')
        self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
        C = self.fake_gerrit.addFakeChange('org/project3', 'master', 'C')
        self.fake_gerrit.addEvent(C.getPatchsetCreatedEvent(1))

        # record previous tenant reconfiguration time, which may not be set
        old_one = self.sched.tenant_last_reconfigured.get('tenant-one', 0)
        old_two = self.sched.tenant_last_reconfigured.get('tenant-two', 0)
        self.waitUntilSettled()

        # Swap in the new tenant config file before reconfiguring.
        self.newTenantConfig('config/multi-tenant/main-reconfig.yaml')

        self.smartReconfigure(command_socket=command_socket)

        # Wait for smart reconfiguration. Only tenant-two should be
        # reconfigured. Note that waitUntilSettled is not
        # reliable here because the reconfigure event may arrive in the
        # event queue after waitUntilSettled.
        start = time.time()
        while True:
            if time.time() - start > 15:
                raise Exception("Timeout waiting for smart reconfiguration")
            new_two = self.sched.tenant_last_reconfigured.get('tenant-two', 0)
            if old_two < new_two:
                # Reconfiguration timestamp advanced: tenant-two is done.
                break
            else:
                time.sleep(0.1)

        # Ensure that tenant-one has not been reconfigured
        self.waitUntilSettled()
        new_one = self.sched.tenant_last_reconfigured.get('tenant-one', 0)
        self.assertEqual(old_one, new_one)

        self.executor_server.hold_jobs_in_build = False
        self.executor_server.release()
        self.waitUntilSettled()

        # Changes in tenant-one and tenant-two have to be reported
        self.assertEqual(1, A.reported)
        self.assertEqual(1, B.reported)

        # The tenant-three has been removed so nothing should be reported
        self.assertEqual(0, C.reported)

        # Verify known tenants
        expected_tenants = {'tenant-one', 'tenant-two', 'tenant-four'}
        self.assertEqual(expected_tenants, self.sched.abide.tenants.keys())

        self.assertIsNotNone(
            self.sched.tenant_last_reconfigured.get('tenant-four'),
            'Tenant tenant-four should exist now.')

        # Test that the new tenant-four actually works
        D = self.fake_gerrit.addFakeChange('org/project4', 'master', 'D')
        self.fake_gerrit.addEvent(D.getPatchsetCreatedEvent(1))
        self.waitUntilSettled()
        self.assertEqual(1, D.reported)

        # Test that the new project in tenant-two works
        B2 = self.fake_gerrit.addFakeChange('org/project2b', 'master', 'B2')
        self.fake_gerrit.addEvent(B2.getPatchsetCreatedEvent(1))
        self.waitUntilSettled()
        self.assertEqual(1, B2.reported)

    def test_smart_reconfiguration(self):
        "Test that smart reconfiguration works"
        self._test_smart_reconfiguration()

    def test_smart_reconfiguration_command_socket(self):
        "Test that smart reconfiguration works using command socket"
        self._test_smart_reconfiguration(command_socket=True)
|
||||
|
|
|
@ -59,6 +59,15 @@ class Scheduler(zuul.cmd.ZuulDaemonApp):
|
|||
except Exception:
|
||||
self.log.exception("Reconfiguration failed:")
|
||||
|
||||
    def smartReconfigure(self):
        """Handle the ``smart-reconfigure`` command.

        Re-reads the scheduler configuration and logging settings, then
        submits a smart reconfiguration event so only tenants whose
        configuration changed are reloaded.
        """
        self.log.debug("Smart reconfiguration triggered")
        # Re-read zuul.conf first so any changes to scheduler settings
        # (e.g. the tenant config file path) take effect.
        self.readConfig()
        self.setup_logging('scheduler', 'log_config')
        try:
            self.sched.reconfigure(self.config, smart=True)
        except Exception:
            # A failed reconfiguration is logged rather than fatal so the
            # daemon keeps running with its previous configuration.
            self.log.exception("Reconfiguration failed:")
|
||||
|
||||
def reconfigure_handler(self, signum, frame):
|
||||
signal.signal(signal.SIGHUP, signal.SIG_IGN)
|
||||
self.fullReconfigure()
|
||||
|
|
|
@ -2173,17 +2173,30 @@ class ConfigLoader(object):
|
|||
self.log.warning(err.error)
|
||||
return abide
|
||||
|
||||
def reloadTenant(self, abide, tenant, ansible_manager):
|
||||
def reloadTenant(self, abide, tenant, ansible_manager,
|
||||
unparsed_abide=None):
|
||||
new_abide = model.Abide()
|
||||
new_abide.tenants = abide.tenants.copy()
|
||||
new_abide.admin_rules = abide.admin_rules.copy()
|
||||
new_abide.unparsed_project_branch_cache = \
|
||||
abide.unparsed_project_branch_cache
|
||||
|
||||
if unparsed_abide:
|
||||
# We got a new unparsed abide so re-load the tenant completely.
|
||||
# First check if the tenant is still existing and if not remove
|
||||
# from the abide.
|
||||
if tenant.name not in unparsed_abide.known_tenants:
|
||||
del new_abide.tenants[tenant.name]
|
||||
return new_abide
|
||||
|
||||
unparsed_config = next(t for t in unparsed_abide.tenants
|
||||
if t['name'] == tenant.name)
|
||||
else:
|
||||
unparsed_config = tenant.unparsed_config
|
||||
|
||||
# When reloading a tenant only, use cached data if available.
|
||||
new_tenant = self.tenant_parser.fromYaml(
|
||||
new_abide,
|
||||
tenant.unparsed_config, ansible_manager)
|
||||
new_abide, unparsed_config, ansible_manager)
|
||||
new_abide.tenants[tenant.name] = new_tenant
|
||||
if len(new_tenant.layout.loading_errors):
|
||||
self.log.warning(
|
||||
|
|
|
@ -40,9 +40,9 @@ from zuul.lib.logutil import get_annotated_logger
|
|||
from zuul.lib.statsd import get_statsd
|
||||
import zuul.lib.queue
|
||||
import zuul.lib.repl
|
||||
from zuul.model import Build, HoldRequest
|
||||
from zuul.model import Build, HoldRequest, Tenant
|
||||
|
||||
COMMANDS = ['full-reconfigure', 'stop', 'repl', 'norepl']
|
||||
COMMANDS = ['full-reconfigure', 'smart-reconfigure', 'stop', 'repl', 'norepl']
|
||||
|
||||
|
||||
class ManagementEvent(object):
|
||||
|
@ -79,6 +79,17 @@ class ReconfigureEvent(ManagementEvent):
|
|||
self.config = config
|
||||
|
||||
|
||||
class SmartReconfigureEvent(ManagementEvent):
    """Reconfigure the scheduler, reloading only changed tenants.

    The tenant configuration is re-read from the path specified in the
    configuration, and only tenants whose configuration differs from the
    currently loaded one are (re-)loaded.

    :arg ConfigParser config: the new configuration
    """
    # Note: the previous `smart=False` constructor parameter was never
    # stored or read and the only producer passes `config` alone, so the
    # dead parameter has been dropped.
    def __init__(self, config):
        super().__init__()
        self.config = config
|
||||
|
||||
|
||||
class TenantReconfigureEvent(ManagementEvent):
|
||||
"""Reconfigure the given tenant. The layout will be (re-)loaded from
|
||||
the path specified in the configuration.
|
||||
|
@ -282,6 +293,7 @@ class Scheduler(threading.Thread):
|
|||
self.command_map = {
|
||||
'stop': self.stop,
|
||||
'full-reconfigure': self.fullReconfigureCommandHandler,
|
||||
'smart-reconfigure': self.smartReconfigureCommandHandler,
|
||||
'repl': self.start_repl,
|
||||
'norepl': self.stop_repl,
|
||||
}
|
||||
|
@ -534,6 +546,9 @@ class Scheduler(threading.Thread):
|
|||
    def fullReconfigureCommandHandler(self):
        # Command-socket handler for `full-reconfigure`: delegate to the
        # app wrapper, which re-reads the config before submitting the
        # reconfiguration event.
        self._zuul_app.fullReconfigure()
|
||||
|
||||
    def smartReconfigureCommandHandler(self):
        # Command-socket handler for `smart-reconfigure`: delegate to the
        # app wrapper, which re-reads the config and submits a smart
        # reconfiguration event.
        self._zuul_app.smartReconfigure()
|
||||
|
||||
def start_repl(self):
|
||||
if self.repl:
|
||||
return
|
||||
|
@ -546,9 +561,12 @@ class Scheduler(threading.Thread):
|
|||
self.repl.stop()
|
||||
self.repl = None
|
||||
|
||||
def reconfigure(self, config):
|
||||
def reconfigure(self, config, smart=False):
|
||||
self.log.debug("Submitting reconfiguration event")
|
||||
event = ReconfigureEvent(config)
|
||||
if smart:
|
||||
event = SmartReconfigureEvent(config)
|
||||
else:
|
||||
event = ReconfigureEvent(config)
|
||||
self.management_event_queue.put(event)
|
||||
self.wake_event.set()
|
||||
self.log.debug("Waiting for reconfiguration")
|
||||
|
@ -811,6 +829,60 @@ class Scheduler(threading.Thread):
|
|||
self.log.info("Full reconfiguration complete (duration: %s seconds)",
|
||||
duration)
|
||||
|
||||
    def _doSmartReconfigureEvent(self, event):
        # This is called in the scheduler loop after another thread submits
        # a request
        #
        # Re-reads the tenant configuration and reloads only the tenants
        # whose unparsed configuration differs from the previously loaded
        # one; unchanged tenants keep their current layout.
        reconfigured_tenants = []
        with self.layout_lock:
            self.config = event.config
            self.log.info("Smart reconfiguration beginning")
            start = time.monotonic()

            # Reload the ansible manager in case the default ansible version
            # changed.
            default_ansible_version = get_default(
                self.config, 'scheduler', 'default_ansible_version', None)
            self.ansible_manager = AnsibleManager(
                default_version=default_ansible_version)

            loader = configloader.ConfigLoader(
                self.connections, self, self.merger,
                self._get_key_dir())
            tenant_config, script = self._checkTenantSourceConf(self.config)
            # Keep the previous unparsed abide so tenants can be diffed
            # against it below.
            old_unparsed_abide = self.unparsed_abide
            self.unparsed_abide = loader.readConfig(
                tenant_config, from_script=script)

            # We need to handle new and deleted tenants so we need to process
            # all tenants from the currently known and the new ones.
            tenant_names = {t for t in self.abide.tenants}
            tenant_names.update(self.unparsed_abide.known_tenants)
            for tenant_name in tenant_names:
                # Compare the raw (unparsed) tenant entries; a missing
                # tenant yields an empty list on that side, so additions
                # and deletions also register as differences.
                old_tenant = [x for x in old_unparsed_abide.tenants
                              if x['name'] == tenant_name]
                new_tenant = [x for x in self.unparsed_abide.tenants
                              if x['name'] == tenant_name]
                if old_tenant == new_tenant:
                    # Unchanged tenant: skip the (expensive) reload.
                    continue

                reconfigured_tenants.append(tenant_name)
                # NOTE: old_tenant is re-bound here from the unparsed list
                # above to the loaded Tenant object (or None).
                old_tenant = self.abide.tenants.get(tenant_name)
                if old_tenant is None:
                    # If there is no old tenant, use a fake tenant with the
                    # correct name
                    old_tenant = Tenant(tenant_name)
                abide = loader.reloadTenant(
                    self.abide, old_tenant, self.ansible_manager,
                    self.unparsed_abide)

                # reloadTenant drops tenants that no longer exist in the
                # new config; only reconfigure tenants that remain.
                tenant = abide.tenants.get(tenant_name)
                if tenant is not None:
                    self._reconfigureTenant(tenant)
                self.abide = abide
        duration = round(time.monotonic() - start, 3)
        self.log.info("Smart reconfiguration of tenants %s complete "
                      "(duration: %s seconds)", reconfigured_tenants, duration)
|
||||
|
||||
def _doTenantReconfigureEvent(self, event):
|
||||
# This is called in the scheduler loop after another thread submits
|
||||
# a request
|
||||
|
@ -1242,6 +1314,8 @@ class Scheduler(threading.Thread):
|
|||
try:
|
||||
if isinstance(event, ReconfigureEvent):
|
||||
self._doReconfigureEvent(event)
|
||||
if isinstance(event, SmartReconfigureEvent):
|
||||
self._doSmartReconfigureEvent(event)
|
||||
elif isinstance(event, TenantReconfigureEvent):
|
||||
self._doTenantReconfigureEvent(event)
|
||||
elif isinstance(event, PromoteEvent):
|
||||
|
|
Loading…
Reference in New Issue