diff --git a/doc/source/monitoring.rst b/doc/source/monitoring.rst
index 57f4a2534c..1565d04964 100644
--- a/doc/source/monitoring.rst
+++ b/doc/source/monitoring.rst
@@ -70,6 +70,18 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
 
       The size of the tenant's management event queue.
 
+.. stat:: zuul.tenant.<tenant>.reconfiguration_time
+   :type: timer
+
+   A timer metric reporting the time taken to reconfigure a tenant.
+   This is performed by one scheduler after a tenant reconfiguration
+   event is received.  During this time, all processing of that
+   tenant's pipelines is halted; this metric measures that interval.
+
+   Once the first scheduler completes a tenant reconfiguration, other
+   schedulers may update their layout in the background without
+   interrupting processing; that time is not reported in this metric.
+
 .. stat:: zuul.tenant.<tenant>.trigger_events
    :type: gauge
diff --git a/releasenotes/notes/pipeline-timing-ea263e6e5939b1aa.yaml b/releasenotes/notes/pipeline-timing-ea263e6e5939b1aa.yaml
index 7cd68727a1..c00c6ea144 100644
--- a/releasenotes/notes/pipeline-timing-ea263e6e5939b1aa.yaml
+++ b/releasenotes/notes/pipeline-timing-ea263e6e5939b1aa.yaml
@@ -22,3 +22,4 @@ features:
     * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.write_bytes`
     * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.event_process`
     * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.handling`
+    * :stat:`zuul.tenant.<tenant>.reconfiguration_time`
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index 4336261b9b..4b747009fa 100644
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -443,6 +443,7 @@ class TestScheduler(ZuulTestCase):
         for key in [
                 'zuul.tenant.tenant-one.event_enqueue_processing_time',
                 'zuul.tenant.tenant-one.event_enqueue_time',
+                'zuul.tenant.tenant-one.reconfiguration_time',
                 'zuul.tenant.tenant-one.pipeline.gate.event_enqueue_time',
                 'zuul.tenant.tenant-one.pipeline.gate.merge_request_time',
                 'zuul.tenant.tenant-one.pipeline.gate.job_freeze_time',
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index efa904ce9e..cc5fe1ae1d 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -878,6 +878,7 @@ class Scheduler(threading.Thread):
 
         with self.layout_lock:
             for tenant_name in new_tenants:
+                stats_key = f'zuul.tenant.{tenant_name}'
                 layout_state = self.tenant_layout_state.get(tenant_name)
                 # In case we don't have a cached layout state we need to
                 # acquire the write lock since we load a new tenant.
@@ -886,12 +887,15 @@ class Scheduler(threading.Thread):
                     # we are starting from an empty layout state and there
                     # should be no concurrent read locks.
                     lock_ctx = tenant_write_lock(self.zk_client, tenant_name)
+                    timer_ctx = self.statsd_timer(
+                        f'{stats_key}.reconfiguration_time')
                 else:
                     lock_ctx = tenant_read_lock(self.zk_client, tenant_name)
+                    timer_ctx = nullcontext()
 
                 # Consider all caches valid (min. ltime -1)
                 min_ltimes = defaultdict(lambda: defaultdict(lambda: -1))
-                with lock_ctx as tlock:
+                with lock_ctx as tlock, timer_ctx:
                     # Refresh the layout state now that we are holding the lock
                     # and we can be sure it won't be changed concurrently.
                     layout_state = self.tenant_layout_state.get(tenant_name)
@@ -1320,8 +1324,11 @@ class Scheduler(threading.Thread):
             # Consider all project branch caches valid.
            branch_cache_min_ltimes = defaultdict(lambda: -1)
 
-        with tenant_write_lock(self.zk_client, tenant_name,
-                               identifier=RECONFIG_LOCK_ID) as lock:
+        stats_key = f'zuul.tenant.{tenant_name}'
+        with tenant_write_lock(
+                self.zk_client, tenant_name,
+                identifier=RECONFIG_LOCK_ID) as lock,\
+                self.statsd_timer(f'{stats_key}.reconfiguration_time'):
             tenant = loader.loadTenant(
                 self.abide, tenant_name, self.ansible_manager,
                 self.unparsed_abide, min_ltimes=min_ltimes,
@@ -1380,8 +1387,11 @@ class Scheduler(threading.Thread):
             loader.loadTPCs(self.abide, self.unparsed_abide,
                             [event.tenant_name])
 
-        with tenant_write_lock(self.zk_client, event.tenant_name,
-                               identifier=RECONFIG_LOCK_ID) as lock:
+        stats_key = f'zuul.tenant.{event.tenant_name}'
+        with tenant_write_lock(
+                self.zk_client, event.tenant_name,
+                identifier=RECONFIG_LOCK_ID) as lock,\
+                self.statsd_timer(f'{stats_key}.reconfiguration_time'):
             loader.loadTenant(
                 self.abide, event.tenant_name, self.ansible_manager,
                 self.unparsed_abide, min_ltimes=min_ltimes,
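
The scheduler change above keys the timer to the lock type: only the
scheduler that takes the write lock and actually performs the
reconfiguration reports the timer, while schedulers taking the read lock
pair the lock with nullcontext() so the with-statement stays uniform
without emitting anything.  A minimal self-contained sketch of that
pattern (the statsd_timer helper and update_tenant function below are
illustrative stand-ins, not Zuul's actual API):

    import time
    from contextlib import contextmanager, nullcontext

    @contextmanager
    def statsd_timer(key):
        # Illustrative stand-in for a statsd timer context manager:
        # measure the wall-clock time of the with-block body.
        start = time.monotonic()
        try:
            yield
        finally:
            elapsed_ms = (time.monotonic() - start) * 1000
            # A real statsd client would send this line over UDP.
            print(f'{key}:{elapsed_ms:.0f}|ms')

    def update_tenant(tenant_name, performs_reconfiguration):
        stats_key = f'zuul.tenant.{tenant_name}'
        if performs_reconfiguration:
            # The scheduler holding the write lock does the actual
            # reconfiguration, so it reports the timer.
            timer_ctx = statsd_timer(f'{stats_key}.reconfiguration_time')
        else:
            # Other schedulers only refresh their cached layout in the
            # background; nullcontext() keeps the with-statement uniform
            # while emitting nothing.
            timer_ctx = nullcontext()
        with timer_ctx:
            time.sleep(0.01)  # stand-in for loading the tenant layout

    update_tenant('tenant-one', performs_reconfiguration=True)

Selecting the context manager ahead of the single with-statement avoids
duplicating the body of the block for the timed and untimed cases.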
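To observe the new metric concretely (the test above expects the key
zuul.tenant.tenant-one.reconfiguration_time), note that a statsd timer
arrives on the wire as a plain text line of the form <key>:<ms>|ms.  A
small sketch of a local listener, assuming the emitter sends standard
statsd UDP to the conventional port 8125:

    import socket

    def watch_statsd(host='127.0.0.1', port=8125):
        # Print reconfiguration timers as they arrive.  Point the
        # emitter's statsd configuration at this host/port to observe
        # the metric locally.
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        sock.bind((host, port))
        while True:
            data, _ = sock.recvfrom(4096)
            for line in data.decode('utf-8', 'replace').splitlines():
                if '.reconfiguration_time:' in line and line.endswith('|ms'):
                    # e.g. zuul.tenant.tenant-one.reconfiguration_time:1234|ms
                    print(line)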