Add a tenant reconfiguration metric

This allows operators to see when tenant reconfiguration events are
processed, and how long they halt pipeline processing.

Change-Id: I30102b2d51ae98ade194722a7720ddec7eed4dad
This commit is contained in:
James E. Blair 2022-03-02 13:48:36 -08:00
parent c5f60e9004
commit bff3f8e4df
4 changed files with 29 additions and 5 deletions

View File

@ -70,6 +70,18 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
The size of the tenant's management event queue.
.. stat:: zuul.tenant.<tenant>.reconfiguration_time
:type: timer
A timer metric reporting the time taken to reconfigure a tenant.
This is performed by one scheduler after a tenant reconfiguration
event is received. During this time, all processing of that
tenant's pipelines is halted. This measures that time.
Once the first scheduler completes a tenant reconfiguration, other
schedulers may update their layout in the background without
interrupting processing. That is not reported in this metric.
.. stat:: zuul.tenant.<tenant>.trigger_events
:type: gauge

View File

@ -22,3 +22,4 @@ features:
* :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.write_bytes`
* :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.event_process`
* :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.handling`
* :stat:`zuul.tenant.<tenant>.reconfiguration_time`

View File

@ -443,6 +443,7 @@ class TestScheduler(ZuulTestCase):
for key in [
'zuul.tenant.tenant-one.event_enqueue_processing_time',
'zuul.tenant.tenant-one.event_enqueue_time',
'zuul.tenant.tenant-one.reconfiguration_time',
'zuul.tenant.tenant-one.pipeline.gate.event_enqueue_time',
'zuul.tenant.tenant-one.pipeline.gate.merge_request_time',
'zuul.tenant.tenant-one.pipeline.gate.job_freeze_time',

View File

@ -878,6 +878,7 @@ class Scheduler(threading.Thread):
with self.layout_lock:
for tenant_name in new_tenants:
stats_key = f'zuul.tenant.{tenant_name}'
layout_state = self.tenant_layout_state.get(tenant_name)
# In case we don't have a cached layout state we need to
# acquire the write lock since we load a new tenant.
@ -886,12 +887,15 @@ class Scheduler(threading.Thread):
# we are starting from an empty layout state and there
# should be no concurrent read locks.
lock_ctx = tenant_write_lock(self.zk_client, tenant_name)
timer_ctx = self.statsd_timer(
f'{stats_key}.reconfiguration_time')
else:
lock_ctx = tenant_read_lock(self.zk_client, tenant_name)
timer_ctx = nullcontext()
# Consider all caches valid (min. ltime -1)
min_ltimes = defaultdict(lambda: defaultdict(lambda: -1))
with lock_ctx as tlock:
with lock_ctx as tlock, timer_ctx:
# Refresh the layout state now that we are holding the lock
# and we can be sure it won't be changed concurrently.
layout_state = self.tenant_layout_state.get(tenant_name)
@ -1320,8 +1324,11 @@ class Scheduler(threading.Thread):
# Consider all project branch caches valid.
branch_cache_min_ltimes = defaultdict(lambda: -1)
with tenant_write_lock(self.zk_client, tenant_name,
identifier=RECONFIG_LOCK_ID) as lock:
stats_key = f'zuul.tenant.{tenant_name}'
with tenant_write_lock(
self.zk_client, tenant_name,
identifier=RECONFIG_LOCK_ID) as lock,\
self.statsd_timer(f'{stats_key}.reconfiguration_time'):
tenant = loader.loadTenant(
self.abide, tenant_name, self.ansible_manager,
self.unparsed_abide, min_ltimes=min_ltimes,
@ -1380,8 +1387,11 @@ class Scheduler(threading.Thread):
loader.loadTPCs(self.abide, self.unparsed_abide,
[event.tenant_name])
with tenant_write_lock(self.zk_client, event.tenant_name,
identifier=RECONFIG_LOCK_ID) as lock:
stats_key = f'zuul.tenant.{event.tenant_name}'
with tenant_write_lock(
self.zk_client, event.tenant_name,
identifier=RECONFIG_LOCK_ID) as lock,\
self.statsd_timer(f'{stats_key}.reconfiguration_time'):
loader.loadTenant(
self.abide, event.tenant_name, self.ansible_manager,
self.unparsed_abide, min_ltimes=min_ltimes,