Merge "Add support for smart reconfigurations"
This commit is contained in:
@@ -356,13 +356,25 @@ Operation
|
||||
To start the scheduler, run ``zuul-scheduler``. To stop it, kill the
|
||||
PID which was saved in the pidfile specified in the configuration.
|
||||
|
||||
Reconfiguration
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
Most of Zuul's configuration is automatically updated as changes to
|
||||
the repositories which contain it are merged. However, Zuul must be
|
||||
explicitly notified of changes to the tenant config file, since it is
|
||||
not read from a git repository. To do so, run
|
||||
``zuul-scheduler full-reconfigure``. The signal based method by sending
|
||||
not read from a git repository. Zuul supports two kinds of reconfigurations.
|
||||
|
||||
The full reconfiguration refetches and reloads the configuration of all
|
||||
tenants. To do so, run ``zuul-scheduler full-reconfigure``. For example, this
|
||||
can be used to fix possible configuration inconsistencies after connection
|
||||
problems with Gerrit/GitHub. The signal-based method of sending
|
||||
a ``SIGHUP`` signal to the scheduler PID is deprecated.
|
||||
|
||||
The smart reconfiguration reloads only the tenants that changed their
|
||||
configuration in the tenant config file. To do so, run
|
||||
``zuul-scheduler smart-reconfigure``. In multi-tenant systems this can be much
|
||||
faster than the full reconfiguration so it is recommended to use the smart
|
||||
reconfiguration after changing the tenant configuration file.
|
||||
|
||||
Merger
|
||||
------
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Zuul now supports triggering a smart reconfiguration by using the command
|
||||
``zuul-scheduler smart-reconfigure``.
|
||||
@@ -3346,6 +3346,18 @@ class ZuulTestCase(BaseTestCase):
|
||||
except Exception:
|
||||
self.log.exception("Reconfiguration failed:")
|
||||
|
||||
def smartReconfigure(self, command_socket=False):
    """Trigger a smart reconfiguration of the scheduler under test.

    :arg bool command_socket: when true, the ``smart-reconfigure``
        command is written to the unix socket whose path is read from
        the ``command_socket`` option of the ``scheduler`` config
        section; otherwise ``self.sched.reconfigure`` is called
        directly with ``smart=True``.

    Any exception raised on either path is logged and swallowed.
    """
    try:
        if not command_socket:
            # Direct path: hand the request straight to the scheduler.
            self.sched.reconfigure(self.config, smart=True)
            return

        # Socket path: send the command the same way an operator would.
        socket_path = self.config.get('scheduler', 'command_socket')
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
            sock.connect(socket_path)
            sock.sendall('smart-reconfigure\n'.encode('utf8'))
    except Exception:
        self.log.exception("Reconfiguration failed:")
|
||||
|
||||
def configure_connections(self, source_only=False):
|
||||
# Set up gerrit related fakes
|
||||
# Set a changes database so multiple FakeGerrit's can report back to
|
||||
|
||||
4
tests/fixtures/config/multi-tenant/git/org_project2b/.zuul.yaml
vendored
Normal file
4
tests/fixtures/config/multi-tenant/git/org_project2b/.zuul.yaml
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
- project:
|
||||
check:
|
||||
jobs:
|
||||
- python27
|
||||
1
tests/fixtures/config/multi-tenant/git/org_project2b/README
vendored
Normal file
1
tests/fixtures/config/multi-tenant/git/org_project2b/README
vendored
Normal file
@@ -0,0 +1 @@
|
||||
test
|
||||
4
tests/fixtures/config/multi-tenant/git/org_project3/.zuul.yaml
vendored
Normal file
4
tests/fixtures/config/multi-tenant/git/org_project3/.zuul.yaml
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
- project:
|
||||
check:
|
||||
jobs:
|
||||
- python27
|
||||
1
tests/fixtures/config/multi-tenant/git/org_project3/README
vendored
Normal file
1
tests/fixtures/config/multi-tenant/git/org_project3/README
vendored
Normal file
@@ -0,0 +1 @@
|
||||
test
|
||||
4
tests/fixtures/config/multi-tenant/git/org_project4/.zuul.yaml
vendored
Normal file
4
tests/fixtures/config/multi-tenant/git/org_project4/.zuul.yaml
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
- project:
|
||||
check:
|
||||
jobs:
|
||||
- python27
|
||||
1
tests/fixtures/config/multi-tenant/git/org_project4/README
vendored
Normal file
1
tests/fixtures/config/multi-tenant/git/org_project4/README
vendored
Normal file
@@ -0,0 +1 @@
|
||||
test
|
||||
40
tests/fixtures/config/multi-tenant/main-reconfig.yaml
vendored
Normal file
40
tests/fixtures/config/multi-tenant/main-reconfig.yaml
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
- tenant:
|
||||
name: tenant-one
|
||||
max-job-timeout: 1800
|
||||
allowed-reporters:
|
||||
- gerrit
|
||||
allowed-labels:
|
||||
- tenant-one-.*
|
||||
- ubuntu-trusty
|
||||
- fake
|
||||
source:
|
||||
gerrit:
|
||||
config-projects:
|
||||
- common-config
|
||||
- tenant-one-config
|
||||
untrusted-projects:
|
||||
- org/project1
|
||||
|
||||
- tenant:
|
||||
name: tenant-two
|
||||
max-nodes-per-job: 10
|
||||
allowed-triggers: gerrit
|
||||
source:
|
||||
gerrit:
|
||||
config-projects:
|
||||
- common-config
|
||||
- tenant-two-config
|
||||
untrusted-projects:
|
||||
- org/project2
|
||||
- org/project2b
|
||||
|
||||
- tenant:
|
||||
name: tenant-four
|
||||
max-nodes-per-job: 10
|
||||
allowed-triggers: gerrit
|
||||
source:
|
||||
gerrit:
|
||||
config-projects:
|
||||
- common-config
|
||||
untrusted-projects:
|
||||
- org/project4
|
||||
11
tests/fixtures/config/multi-tenant/main.yaml
vendored
11
tests/fixtures/config/multi-tenant/main.yaml
vendored
@@ -26,3 +26,14 @@
|
||||
- tenant-two-config
|
||||
untrusted-projects:
|
||||
- org/project2
|
||||
|
||||
- tenant:
|
||||
name: tenant-three
|
||||
max-nodes-per-job: 10
|
||||
allowed-triggers: gerrit
|
||||
source:
|
||||
gerrit:
|
||||
config-projects:
|
||||
- common-config
|
||||
untrusted-projects:
|
||||
- org/project3
|
||||
|
||||
@@ -8075,3 +8075,96 @@ class TestReportBuildPage(ZuulTestCase):
|
||||
dict(name='python27', result='SUCCESS', changes='1,1'),
|
||||
])
|
||||
self.assertIn('python27 finger://', A.messages[0])
|
||||
|
||||
|
||||
class TestSchedulerSmartReconfiguration(ZuulTestCase):
    """Smart reconfiguration tests on the multi-tenant fixture."""

    tenant_config_file = 'config/multi-tenant/main.yaml'

    def _test_smart_reconfiguration(self, command_socket=False):
        """
        Check that a smart reconfiguration handles each kind of change.

        The test starts with the tenants tenant-one, tenant-two and
        tenant-three, switches to a tenant config file with the changes
        below and then triggers a smart reconfiguration:

        - tenant-one is left unchanged
        - tenant-two gains an additional repo
        - tenant-three is removed completely
        - tenant-four is added as a new tenant

        :arg bool command_socket: passed through to ``smartReconfigure``.
        """

        def submit_change(project, subject):
            # Create a change on master and announce its first patchset.
            change = self.fake_gerrit.addFakeChange(
                project, 'master', subject)
            self.fake_gerrit.addEvent(change.getPatchsetCreatedEvent(1))
            return change

        def last_reconfigured(tenant_name):
            # The timestamp may not be set yet, hence the 0 default.
            return self.sched.tenant_last_reconfigured.get(tenant_name, 0)

        self.executor_server.hold_jobs_in_build = True

        # One change per tenant that exists before the reconfiguration.
        change_a = submit_change('org/project1', 'A')
        change_b = submit_change('org/project2', 'B')
        change_c = submit_change('org/project3', 'C')

        # Remember the reconfiguration times seen before the switch.
        old_one = last_reconfigured('tenant-one')
        old_two = last_reconfigured('tenant-two')
        self.waitUntilSettled()

        self.newTenantConfig('config/multi-tenant/main-reconfig.yaml')

        self.smartReconfigure(command_socket=command_socket)

        # Poll until tenant-two's reconfiguration time advances.
        # waitUntilSettled is not reliable here because the reconfigure
        # event may arrive in the event queue after waitUntilSettled.
        start = time.time()
        while True:
            if time.time() - start > 15:
                raise Exception("Timeout waiting for smart reconfiguration")
            if old_two < last_reconfigured('tenant-two'):
                break
            time.sleep(0.1)

        # tenant-one must not have been reconfigured.
        self.waitUntilSettled()
        new_one = last_reconfigured('tenant-one')
        self.assertEqual(old_one, new_one)

        self.executor_server.hold_jobs_in_build = False
        self.executor_server.release()
        self.waitUntilSettled()

        # Changes in tenant-one and tenant-two have to be reported.
        self.assertEqual(1, change_a.reported)
        self.assertEqual(1, change_b.reported)

        # tenant-three has been removed so nothing should be reported.
        self.assertEqual(0, change_c.reported)

        # The set of known tenants must match the new config file.
        expected_tenants = {'tenant-one', 'tenant-two', 'tenant-four'}
        self.assertEqual(expected_tenants, self.sched.abide.tenants.keys())

        self.assertIsNotNone(
            self.sched.tenant_last_reconfigured.get('tenant-four'),
            'Tenant tenant-four should exist now.')

        # The new tenant-four must actually process changes.
        change_d = submit_change('org/project4', 'D')
        self.waitUntilSettled()
        self.assertEqual(1, change_d.reported)

        # The project added to tenant-two must process changes as well.
        change_b2 = submit_change('org/project2b', 'B2')
        self.waitUntilSettled()
        self.assertEqual(1, change_b2.reported)

    def test_smart_reconfiguration(self):
        "Test that smart reconfiguration works"
        self._test_smart_reconfiguration()

    def test_smart_reconfiguration_command_socket(self):
        "Test that smart reconfiguration works using command socket"
        self._test_smart_reconfiguration(command_socket=True)
|
||||
|
||||
@@ -59,6 +59,15 @@ class Scheduler(zuul.cmd.ZuulDaemonApp):
|
||||
except Exception:
|
||||
self.log.exception("Reconfiguration failed:")
|
||||
|
||||
def smartReconfigure(self):
    """Re-read the config and ask the scheduler for a smart reconfigure.

    The configuration is read again and logging is set up again before
    the request is submitted. A failure while submitting the request is
    logged and not propagated to the caller.
    """
    self.log.debug("Smart reconfiguration triggered")

    # Pick up the current configuration before submitting the request.
    self.readConfig()
    self.setup_logging('scheduler', 'log_config')

    try:
        self.sched.reconfigure(self.config, smart=True)
    except Exception:
        self.log.exception("Reconfiguration failed:")
|
||||
|
||||
def reconfigure_handler(self, signum, frame):
|
||||
signal.signal(signal.SIGHUP, signal.SIG_IGN)
|
||||
self.fullReconfigure()
|
||||
|
||||
@@ -2173,17 +2173,30 @@ class ConfigLoader(object):
|
||||
self.log.warning(err.error)
|
||||
return abide
|
||||
|
||||
def reloadTenant(self, abide, tenant, ansible_manager):
|
||||
def reloadTenant(self, abide, tenant, ansible_manager,
|
||||
unparsed_abide=None):
|
||||
new_abide = model.Abide()
|
||||
new_abide.tenants = abide.tenants.copy()
|
||||
new_abide.admin_rules = abide.admin_rules.copy()
|
||||
new_abide.unparsed_project_branch_cache = \
|
||||
abide.unparsed_project_branch_cache
|
||||
|
||||
if unparsed_abide:
|
||||
# We got a new unparsed abide so re-load the tenant completely.
|
||||
# First check if the tenant is still existing and if not remove
|
||||
# from the abide.
|
||||
if tenant.name not in unparsed_abide.known_tenants:
|
||||
del new_abide.tenants[tenant.name]
|
||||
return new_abide
|
||||
|
||||
unparsed_config = next(t for t in unparsed_abide.tenants
|
||||
if t['name'] == tenant.name)
|
||||
else:
|
||||
unparsed_config = tenant.unparsed_config
|
||||
|
||||
# When reloading a tenant only, use cached data if available.
|
||||
new_tenant = self.tenant_parser.fromYaml(
|
||||
new_abide,
|
||||
tenant.unparsed_config, ansible_manager)
|
||||
new_abide, unparsed_config, ansible_manager)
|
||||
new_abide.tenants[tenant.name] = new_tenant
|
||||
if len(new_tenant.layout.loading_errors):
|
||||
self.log.warning(
|
||||
|
||||
@@ -40,9 +40,9 @@ from zuul.lib.logutil import get_annotated_logger
|
||||
from zuul.lib.statsd import get_statsd
|
||||
import zuul.lib.queue
|
||||
import zuul.lib.repl
|
||||
from zuul.model import Build, HoldRequest
|
||||
from zuul.model import Build, HoldRequest, Tenant
|
||||
|
||||
COMMANDS = ['full-reconfigure', 'stop', 'repl', 'norepl']
|
||||
COMMANDS = ['full-reconfigure', 'smart-reconfigure', 'stop', 'repl', 'norepl']
|
||||
|
||||
|
||||
class ManagementEvent(object):
|
||||
@@ -79,6 +79,17 @@ class ReconfigureEvent(ManagementEvent):
|
||||
self.config = config
|
||||
|
||||
|
||||
class SmartReconfigureEvent(ManagementEvent):
    """Request a smart reconfiguration of the scheduler.

    ``Scheduler.reconfigure`` submits this event when called with
    ``smart=True``; the scheduler loop passes it to
    ``_doSmartReconfigureEvent``.

    :arg ConfigParser config: the new configuration
    :arg bool smart: accepted but never read by this class
    """
    def __init__(self, config, smart=False):
        super().__init__()
        # ``smart`` is deliberately not stored: the event type itself
        # already identifies the kind of reconfiguration.
        self.config = config
|
||||
|
||||
|
||||
class TenantReconfigureEvent(ManagementEvent):
|
||||
"""Reconfigure the given tenant. The layout will be (re-)loaded from
|
||||
the path specified in the configuration.
|
||||
@@ -282,6 +293,7 @@ class Scheduler(threading.Thread):
|
||||
self.command_map = {
|
||||
'stop': self.stop,
|
||||
'full-reconfigure': self.fullReconfigureCommandHandler,
|
||||
'smart-reconfigure': self.smartReconfigureCommandHandler,
|
||||
'repl': self.start_repl,
|
||||
'norepl': self.stop_repl,
|
||||
}
|
||||
@@ -534,6 +546,9 @@ class Scheduler(threading.Thread):
|
||||
def fullReconfigureCommandHandler(self):
|
||||
self._zuul_app.fullReconfigure()
|
||||
|
||||
def smartReconfigureCommandHandler(self):
    """Handle the ``smart-reconfigure`` command.

    Delegates to the ``smartReconfigure`` method of the application
    object stored in ``self._zuul_app``.
    """
    app = self._zuul_app
    app.smartReconfigure()
|
||||
|
||||
def start_repl(self):
|
||||
if self.repl:
|
||||
return
|
||||
@@ -546,9 +561,12 @@ class Scheduler(threading.Thread):
|
||||
self.repl.stop()
|
||||
self.repl = None
|
||||
|
||||
def reconfigure(self, config):
|
||||
def reconfigure(self, config, smart=False):
|
||||
self.log.debug("Submitting reconfiguration event")
|
||||
event = ReconfigureEvent(config)
|
||||
if smart:
|
||||
event = SmartReconfigureEvent(config)
|
||||
else:
|
||||
event = ReconfigureEvent(config)
|
||||
self.management_event_queue.put(event)
|
||||
self.wake_event.set()
|
||||
self.log.debug("Waiting for reconfiguration")
|
||||
@@ -811,6 +829,60 @@ class Scheduler(threading.Thread):
|
||||
self.log.info("Full reconfiguration complete (duration: %s seconds)",
|
||||
duration)
|
||||
|
||||
def _doSmartReconfigureEvent(self, event):
    """Reload only the tenants whose tenant-config entry changed.

    This is called in the scheduler loop after another thread submits
    a request.

    The tenant config is read again and, for every tenant name known
    either to the current abide or to the freshly read config, the old
    and new unparsed entries are compared. Tenants with differing
    entries are reloaded through ``ConfigLoader.reloadTenant``; this
    covers changed, newly added and removed tenants.

    :arg SmartReconfigureEvent event: the event carrying the new
        configuration in ``event.config``
    """
    changed_tenant_names = []
    with self.layout_lock:
        self.config = event.config
        self.log.info("Smart reconfiguration beginning")
        start = time.monotonic()

        # Reload the ansible manager in case the default ansible version
        # changed.
        self.ansible_manager = AnsibleManager(
            default_version=get_default(
                self.config, 'scheduler', 'default_ansible_version',
                None))

        loader = configloader.ConfigLoader(
            self.connections, self, self.merger,
            self._get_key_dir())
        tenant_config, script = self._checkTenantSourceConf(self.config)
        previous_unparsed_abide = self.unparsed_abide
        self.unparsed_abide = loader.readConfig(
            tenant_config, from_script=script)

        def unparsed_entries(unparsed_abide, name):
            # All unparsed tenant entries carrying the given name.
            return [entry for entry in unparsed_abide.tenants
                    if entry['name'] == name]

        # We need to handle new and deleted tenants so we need to process
        # all tenants from the currently known and the new ones.
        candidate_names = {name for name in self.abide.tenants}
        candidate_names.update(self.unparsed_abide.known_tenants)
        for name in candidate_names:
            before = unparsed_entries(previous_unparsed_abide, name)
            after = unparsed_entries(self.unparsed_abide, name)
            if before == after:
                # Entry is unchanged, leave this tenant alone.
                continue

            changed_tenant_names.append(name)
            current_tenant = self.abide.tenants.get(name)
            if current_tenant is None:
                # If there is no old tenant, use a fake tenant with the
                # correct name
                current_tenant = Tenant(name)
            new_abide = loader.reloadTenant(
                self.abide, current_tenant, self.ansible_manager,
                self.unparsed_abide)

            # A removed tenant is absent from the new abide, so there is
            # nothing to reconfigure for it.
            reloaded_tenant = new_abide.tenants.get(name)
            if reloaded_tenant is not None:
                self._reconfigureTenant(reloaded_tenant)
            self.abide = new_abide
    duration = round(time.monotonic() - start, 3)
    self.log.info("Smart reconfiguration of tenants %s complete "
                  "(duration: %s seconds)", changed_tenant_names, duration)
|
||||
|
||||
def _doTenantReconfigureEvent(self, event):
|
||||
# This is called in the scheduler loop after another thread submits
|
||||
# a request
|
||||
@@ -1242,6 +1314,8 @@ class Scheduler(threading.Thread):
|
||||
try:
|
||||
if isinstance(event, ReconfigureEvent):
|
||||
self._doReconfigureEvent(event)
|
||||
if isinstance(event, SmartReconfigureEvent):
|
||||
self._doSmartReconfigureEvent(event)
|
||||
elif isinstance(event, TenantReconfigureEvent):
|
||||
self._doTenantReconfigureEvent(event)
|
||||
elif isinstance(event, PromoteEvent):
|
||||
|
||||
Reference in New Issue
Block a user