Add a tenant reconfiguration metric

This allows operators to see when tenant reconfiguration events are
processed, and how long they halt pipeline processing.

Change-Id: I30102b2d51ae98ade194722a7720ddec7eed4dad
This commit is contained in:
James E. Blair 2022-03-02 13:48:36 -08:00
parent c5f60e9004
commit bff3f8e4df
4 changed files with 29 additions and 5 deletions

View File

@ -70,6 +70,18 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
The size of the tenant's management event queue.
.. stat:: zuul.tenant.<tenant>.reconfiguration_time
:type: timer
A timer metric reporting the time taken to reconfigure a tenant.
This is performed by one scheduler after a tenant reconfiguration
event is received. During this time, all processing of that
tenant's pipelines is halted. This measures that time.
Once the first scheduler completes a tenant reconfiguration, other
schedulers may update their layout in the background without
interrupting processing. That is not reported in this metric.
.. stat:: zuul.tenant.<tenant>.trigger_events
:type: gauge

View File

@ -22,3 +22,4 @@ features:
* :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.write_bytes`
* :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.event_process`
* :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.handling`
* :stat:`zuul.tenant.<tenant>.reconfiguration_time`

View File

@ -443,6 +443,7 @@ class TestScheduler(ZuulTestCase):
for key in [
'zuul.tenant.tenant-one.event_enqueue_processing_time',
'zuul.tenant.tenant-one.event_enqueue_time',
'zuul.tenant.tenant-one.reconfiguration_time',
'zuul.tenant.tenant-one.pipeline.gate.event_enqueue_time',
'zuul.tenant.tenant-one.pipeline.gate.merge_request_time',
'zuul.tenant.tenant-one.pipeline.gate.job_freeze_time',

View File

@ -878,6 +878,7 @@ class Scheduler(threading.Thread):
with self.layout_lock:
for tenant_name in new_tenants:
stats_key = f'zuul.tenant.{tenant_name}'
layout_state = self.tenant_layout_state.get(tenant_name)
# In case we don't have a cached layout state we need to
# acquire the write lock since we load a new tenant.
@ -886,12 +887,15 @@ class Scheduler(threading.Thread):
# we are starting from an empty layout state and there
# should be no concurrent read locks.
lock_ctx = tenant_write_lock(self.zk_client, tenant_name)
timer_ctx = self.statsd_timer(
f'{stats_key}.reconfiguration_time')
else:
lock_ctx = tenant_read_lock(self.zk_client, tenant_name)
timer_ctx = nullcontext()
# Consider all caches valid (min. ltime -1)
min_ltimes = defaultdict(lambda: defaultdict(lambda: -1))
with lock_ctx as tlock:
with lock_ctx as tlock, timer_ctx:
# Refresh the layout state now that we are holding the lock
# and we can be sure it won't be changed concurrently.
layout_state = self.tenant_layout_state.get(tenant_name)
@ -1320,8 +1324,11 @@ class Scheduler(threading.Thread):
# Consider all project branch caches valid.
branch_cache_min_ltimes = defaultdict(lambda: -1)
with tenant_write_lock(self.zk_client, tenant_name,
identifier=RECONFIG_LOCK_ID) as lock:
stats_key = f'zuul.tenant.{tenant_name}'
with tenant_write_lock(
self.zk_client, tenant_name,
identifier=RECONFIG_LOCK_ID) as lock,\
self.statsd_timer(f'{stats_key}.reconfiguration_time'):
tenant = loader.loadTenant(
self.abide, tenant_name, self.ansible_manager,
self.unparsed_abide, min_ltimes=min_ltimes,
@ -1380,8 +1387,11 @@ class Scheduler(threading.Thread):
loader.loadTPCs(self.abide, self.unparsed_abide,
[event.tenant_name])
with tenant_write_lock(self.zk_client, event.tenant_name,
identifier=RECONFIG_LOCK_ID) as lock:
stats_key = f'zuul.tenant.{event.tenant_name}'
with tenant_write_lock(
self.zk_client, event.tenant_name,
identifier=RECONFIG_LOCK_ID) as lock,\
self.statsd_timer(f'{stats_key}.reconfiguration_time'):
loader.loadTenant(
self.abide, event.tenant_name, self.ansible_manager,
self.unparsed_abide, min_ltimes=min_ltimes,