Add more pipeline processing stats

This adds the number of ZK objects, ZNodes, and bytes read and written
during each pipeline processing run.  This can help Zuul developers
ascertain where to optimize performance.

Change-Id: Ic2592faeb08d6c2a72b99000864c41ada665cd3b
Author: James E. Blair
Date:   2022-02-28 10:51:37 -08:00
Parent: 72e6234157
Commit: 88b076e8e3

6 changed files with 106 additions and 5 deletions

@@ -235,6 +235,24 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
       The time spent reading data from ZooKeeper during a single
       pipeline processing run.

+   .. stat:: read_znodes
+      :type: gauge
+
+      The number of ZNodes read from ZooKeeper during a single
+      pipeline processing run.
+
+   .. stat:: read_objects
+      :type: gauge
+
+      The number of Zuul data model objects read from ZooKeeper
+      during a single pipeline processing run.
+
+   .. stat:: read_bytes
+      :type: gauge
+
+      The amount of data read from ZooKeeper during a single
+      pipeline processing run.
+
    .. stat:: refresh
       :type: timer
@@ -273,6 +291,24 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
       The time spent writing data to ZooKeeper during a single
       pipeline processing run.

+   .. stat:: write_znodes
+      :type: gauge
+
+      The number of ZNodes written to ZooKeeper during a single
+      pipeline processing run.
+
+   .. stat:: write_objects
+      :type: gauge
+
+      The number of Zuul data model objects written to ZooKeeper
+      during a single pipeline processing run.
+
+   .. stat:: write_bytes
+      :type: gauge
+
+      The amount of data written to ZooKeeper during a single
+      pipeline processing run.
+
 .. stat:: zuul.executor.<executor>

    Holds metrics emitted by individual executors. The ``<executor>``
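
For orientation, here is a minimal sketch of how a timer and a gauge like the ones documented above reach statsd. It uses the third-party statsd Python package rather than Zuul's internal statsd wrapper, and the tenant and pipeline names are made up for illustration:

import statsd

# Hypothetical example, not Zuul code: emit one timer and one gauge.
client = statsd.StatsClient('localhost', 8125)

# Timers carry a duration in milliseconds ("...read_time:12.500000|ms" on the wire).
client.timing('zuul.tenant.example-tenant.pipeline.check.read_time', 12.5)

# Gauges carry an absolute value ("...read_znodes:42|g" on the wire).
client.gauge('zuul.tenant.example-tenant.pipeline.check.read_znodes', 42)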

@@ -14,3 +14,9 @@ features:
     * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.event_job_time`
     * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.read_time`
     * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.write_time`
+    * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.read_objects`
+    * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.write_objects`
+    * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.read_znodes`
+    * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.write_znodes`
+    * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.read_bytes`
+    * :stat:`zuul.tenant.<tenant>.pipeline.<pipeline>.write_bytes`

@@ -456,6 +456,17 @@ class TestScheduler(ZuulTestCase):
             val = self.assertReportedStat(key, kind='ms')
             self.assertTrue(0.0 < float(val) < 60000.0)

+        for key in [
+                'zuul.tenant.tenant-one.pipeline.gate.read_objects',
+                'zuul.tenant.tenant-one.pipeline.gate.write_objects',
+                'zuul.tenant.tenant-one.pipeline.gate.read_znodes',
+                'zuul.tenant.tenant-one.pipeline.gate.write_znodes',
+                'zuul.tenant.tenant-one.pipeline.gate.read_bytes',
+                'zuul.tenant.tenant-one.pipeline.gate.write_bytes',
+        ]:
+            val = self.assertReportedStat(key, kind='g')
+            self.assertTrue(0.0 < float(val) < 60000.0)
+
         self.assertReportedStat('zuul.tenant.tenant-one.pipeline.gate.'
                                 'data_size_compressed',
                                 kind='g')
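
The kind argument corresponds to the statsd type suffix in the raw datagram ('ms' for timers, 'g' for gauges). A rough, self-contained illustration of the parsing such an assertion relies on; this is not Zuul's actual assertReportedStat helper:

# Hypothetical helper for illustration only.
def parse_stat(datagram: bytes):
    # Split a statsd datagram like b'zuul.foo:42|g' into name, value, kind.
    name, rest = datagram.decode('utf-8').split(':', 1)
    value, kind = rest.split('|', 1)
    return name, value, kind

name, value, kind = parse_stat(
    b'zuul.tenant.tenant-one.pipeline.gate.read_znodes:42|g')
assert kind == 'g'
assert 0.0 < float(value) < 60000.0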

@@ -1927,11 +1927,7 @@ class Scheduler(threading.Thread):
                         if refreshed:
                             pipeline.summary.update(ctx, self.globals)
                             if self.statsd:
-                                self.statsd.timing(f'{stats_key}.read_time',
-                                                   ctx.cumulative_read_time * 1000)
-                                self.statsd.timing(
-                                    f'{stats_key}.write_time',
-                                    ctx.cumulative_write_time * 1000)
+                                self._contextStats(ctx, stats_key)
             except LockException:
                 self.log.debug("Skipping locked pipeline %s in tenant %s",
                                pipeline.name, tenant.name)
@@ -1940,6 +1936,24 @@ class Scheduler(threading.Thread):
                     "Exception processing pipeline %s in tenant %s",
                     pipeline.name, tenant.name)

+    def _contextStats(self, ctx, stats_key):
+        self.statsd.timing(f'{stats_key}.read_time',
+                           ctx.cumulative_read_time * 1000)
+        self.statsd.timing(f'{stats_key}.write_time',
+                           ctx.cumulative_write_time * 1000)
+        self.statsd.gauge(f'{stats_key}.read_objects',
+                          ctx.cumulative_read_objects)
+        self.statsd.gauge(f'{stats_key}.write_objects',
+                          ctx.cumulative_write_objects)
+        self.statsd.gauge(f'{stats_key}.read_znodes',
+                          ctx.cumulative_read_znodes)
+        self.statsd.gauge(f'{stats_key}.write_znodes',
+                          ctx.cumulative_write_znodes)
+        self.statsd.gauge(f'{stats_key}.read_bytes',
+                          ctx.cumulative_read_bytes)
+        self.statsd.gauge(f'{stats_key}.write_bytes',
+                          ctx.cumulative_write_bytes)
+
     def _process_pipeline(self, tenant, pipeline):
         # Return whether or not we refreshed the pipeline.
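
Note that the two timers convert the context's cumulative seconds into milliseconds (hence the * 1000), while the six new gauges report raw counts and byte totals taken directly from the ZK context for that run. Since they are gauges, each pipeline processing run simply overwrites the values from the previous run.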

@@ -33,6 +33,8 @@ class RawShardIO(io.RawIOBase):
         self.compressed_bytes_written = 0
         self.cumulative_read_time = 0.0
         self.cumulative_write_time = 0.0
+        self.znodes_read = 0
+        self.znodes_written = 0

     def readable(self):
         return True
@@ -61,6 +63,7 @@ class RawShardIO(io.RawIOBase):
         data, _ = self.client.get(path)
         self.cumulative_read_time += time.perf_counter() - start
         self.compressed_bytes_read += len(data)
+        self.znodes_read += 1
         return zlib.decompress(data)

     def readall(self):
@@ -86,6 +89,7 @@ class RawShardIO(io.RawIOBase):
         )
         self.cumulative_write_time += time.perf_counter() - start
         self.compressed_bytes_written += len(shard_bytes)
+        self.znodes_written += 1
         return min(byte_count, NODE_BYTE_SIZE_LIMIT)
@@ -102,6 +106,10 @@ class BufferedShardWriter(io.BufferedWriter):
     def cumulative_write_time(self):
         return self.__raw.cumulative_write_time

+    @property
+    def znodes_written(self):
+        return self.__raw.znodes_written
+
 class BufferedShardReader(io.BufferedReader):
     def __init__(self, client, path):
@@ -115,3 +123,7 @@ class BufferedShardReader(io.BufferedReader):
     @property
     def cumulative_read_time(self):
         return self.__raw.cumulative_read_time
+
+    @property
+    def znodes_read(self):
+        return self.__raw.znodes_read
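
The pattern used here, counting in the raw stream and re-exporting the counter through a property on the buffered wrapper (since callers only hold the buffered object), can be sketched in isolation roughly as follows. The class and attribute names below are illustrative, not Zuul's:

import io


class CountingRawReader(io.RawIOBase):
    # A raw stream that serves fixed chunks and counts each read call,
    # analogous to RawShardIO counting one ZNode per shard read.

    def __init__(self, chunks):
        super().__init__()
        self._chunks = list(chunks)
        self.chunks_read = 0

    def readable(self):
        return True

    def readinto(self, buf):
        if not self._chunks:
            return 0  # EOF
        chunk = self._chunks.pop(0)
        buf[:len(chunk)] = chunk
        self.chunks_read += 1
        return len(chunk)


class CountingBufferedReader(io.BufferedReader):
    # Buffered wrapper that re-exports the raw stream's counter.

    def __init__(self, chunks):
        self.__raw = CountingRawReader(chunks)
        super().__init__(self.__raw)

    @property
    def chunks_read(self):
        return self.__raw.chunks_read


stream = CountingBufferedReader([b'shard-one', b'shard-two'])
data = stream.read()
print(data, stream.chunks_read)  # b'shard-oneshard-two' 2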

@@ -33,6 +33,12 @@ class ZKContext:
         self.log = log
         self.cumulative_read_time = 0.0
         self.cumulative_write_time = 0.0
+        self.cumulative_read_objects = 0
+        self.cumulative_write_objects = 0
+        self.cumulative_read_znodes = 0
+        self.cumulative_write_znodes = 0
+        self.cumulative_read_bytes = 0
+        self.cumulative_write_bytes = 0

     def sessionIsValid(self):
         return ((not self.lock or self.lock.is_still_valid()) and
@@ -242,6 +248,9 @@ class ZKObject:
             start = time.perf_counter()
             compressed_data, zstat = context.client.get(path)
             context.cumulative_read_time += time.perf_counter() - start
+            context.cumulative_read_objects += 1
+            context.cumulative_read_znodes += 1
+            context.cumulative_read_bytes += len(compressed_data)

         self._set(_zkobject_hash=None)
         try:
@@ -292,6 +301,10 @@ class ZKObject:
                 zstat = context.client.set(path, compressed_data,
                                            version=self._zstat.version)
             context.cumulative_write_time += time.perf_counter() - start
+            context.cumulative_write_objects += 1
+            context.cumulative_write_znodes += 1
+            context.cumulative_write_bytes += len(compressed_data)

             self._set(_zstat=zstat,
                       _zkobject_hash=hash(data),
                       _zkobject_compressed_size=len(compressed_data),
@@ -345,6 +358,11 @@ class ShardedZKObject(ZKObject):
                     compressed_size = stream.compressed_bytes_read
                     context.cumulative_read_time += \
                         stream.cumulative_read_time
+                    context.cumulative_read_objects += 1
+                    context.cumulative_read_znodes += \
+                        stream.znodes_read
+                    context.cumulative_read_bytes += compressed_size
+
                 if not data and context.client.exists(path) is None:
                     raise NoNodeError
                 self._set(**self.deserialize(data, context))
@@ -393,6 +411,10 @@ class ShardedZKObject(ZKObject):
                     compressed_size = stream.compressed_bytes_written
                     context.cumulative_write_time += \
                         stream.cumulative_write_time
+                    context.cumulative_write_objects += 1
+                    context.cumulative_write_znodes += \
+                        stream.znodes_written
+                    context.cumulative_write_bytes += compressed_size
                 self._set(_zkobject_hash=hash(data),
                           _zkobject_compressed_size=compressed_size,