Browse Source

Merge "Rework zuul nodepool stats reporting"

changes/17/591917/9
Zuul 3 years ago
committed by Gerrit Code Review
parent
commit
a6ba568d72
5 changed files with 103 additions and 92 deletions
  1. +48
    -57
      doc/source/admin/monitoring.rst
  2. +10
    -0
      releasenotes/notes/nodepool-statsd-3eb500893833cdc4.yaml
  3. +3
    -0
      tests/base.py
  4. +20
    -20
      tests/unit/test_scheduler.py
  5. +22
    -15
      zuul/nodepool.py

+ 48
- 57
doc/source/admin/monitoring.rst View File

@ -188,78 +188,69 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
The used RAM (excluding buffers and cache) on this executor, as
a percentage multiplied by 100.
.. stat:: zuul.nodepool
.. stat:: zuul.nodepool.requests
Holds metrics related to Zuul requests from Nodepool.
Holds metrics related to Zuul requests and responses from Nodepool.
.. stat:: requested
:type: counter
Incremented each time a node request is submitted to Nodepool.
.. stat:: label.<label>
:type: counter
Incremented each time a request for a specific label is
submitted to Nodepool.
States are one of:
.. stat:: size.<size>
:type: counter
Incremented each time a request of a specific size is submitted
to Nodepool. For example, a request for 3 nodes would use the
key ``zuul.nodepool.requested.size.3``.
.. stat:: canceled
:type: counter, timer
*requested*
Node request submitted by Zuul to Nodepool
*canceled*
Node request was canceled by Zuul
*failed*
Nodepool failed to fulfill a node request
*fulfilled*
Nodes were assigned by Nodepool
The counter is incremented each time a node request is canceled
by Zuul. The timer records the elapsed time from request to
cancelation.
.. stat:: <state>
:type: timer
.. stat:: label.<label>
:type: counter, timer
Records the elapsed time from request to completion for states
`failed` and `fulfilled`. For example,
``zuul.nodepool.requests.fulfilled.mean`` will give the average
time for all fulfilled requests within each ``statsd`` flush
interval.
The same, for a specific label.
A lower value for `fulfilled` requests is better. Ideally,
there will be no `failed` requests.
.. stat:: size.<size>
:type: counter, timer
.. stat:: <state>.total
:type: counter
The same, for a specific request size.
Incremented when nodes are assigned or removed as described in
the states above.
.. stat:: fulfilled
.. stat:: <state>.size.<size>
:type: counter, timer
The counter is incremented each time a node request is fulfilled
by Nodepool. The timer records the elapsed time from request to
fulfillment.
.. stat:: label.<label>
:type: counter, timer
The same, for a specific label.
.. stat:: size.<size>
:type: counter, timer
Increments for the node count of each request. For example, a
request for 3 nodes would use the key
``zuul.nodepool.requests.requested.size.3``; fulfillment of 3
node requests can be tracked with
``zuul.nodepool.requests.fulfilled.size.3``.
The same, for a specific request size.
The timer is implemented for ``fulfilled`` and ``failed``
requests. For example, the timer
``zuul.nodepool.requests.failed.size.3.mean`` gives the average
time of 3-node failed requests within the ``statsd`` flush
interval. A lower value for `fulfilled` requests is better.
Ideally, there will be no `failed` requests.
.. stat:: failed
.. stat:: <state>.label.<label>
:type: counter, timer
The counter is incremented each time Nodepool fails to fulfill a
node request. The timer records the elapsed time from request
to failure.
.. stat:: label.<label>
:type: counter, timer
The same, for a specific label.
.. stat:: size.<size>
:type: counter, timer
The same, for a specific request size.
Increments for the label of each request. For example, requests
for `centos7` nodes could be tracked with
``zuul.nodepool.requests.requested.label.centos7``.
The timer is implemented for ``fulfilled`` and ``failed``
requests. For example, the timer
``zuul.nodepool.requests.fulfilled.label.centos7.mean`` gives
the average time of ``centos7`` fulfilled requests within the
``statsd`` flush interval. A lower value for `fulfilled`
requests is better. Ideally, there will be no `failed`
requests.
.. stat:: current_requests
:type: gauge


+ 10
- 0
releasenotes/notes/nodepool-statsd-3eb500893833cdc4.yaml View File

@ -0,0 +1,10 @@
---
upgrade:
- |
The `zuul.nodepool` statistics have been moved under
`zuul.nodepool.requests` to allow sub-stats to work correctly.
For example `zuul.nodepool.requested` has become
`zuul.nodepool.requests.requested.total`. The previously missing
`label` and `size` counters are now available at
`zuul.nodepool.requests.<state>.label.<label>` and
`zuul.nodepool.requests.<state>.size.<size>`. For more info see
the monitoring documentation.

+ 3
- 0
tests/base.py View File

@ -1294,6 +1294,8 @@ class BuildHistory(object):
class FakeStatsd(threading.Thread):
log = logging.getLogger("zuul.test.FakeStatsd")
def __init__(self):
threading.Thread.__init__(self)
self.daemon = True
@ -1314,6 +1316,7 @@ class FakeStatsd(threading.Thread):
data = self.sock.recvfrom(1024)
if not data:
return
self.log.debug("Appending: %s" % data[0])
self.stats.append(data[0])
if fd == self.wake_read:
return


+ 20
- 20
tests/unit/test_scheduler.py View File

@ -130,9 +130,6 @@ class TestScheduler(ZuulTestCase):
self.assertEqual(self.getJobFromHistory('project-test2').node,
'label1')
for stat in self.statsd.stats:
k, v = stat.decode('utf-8').split(':')
self.log.debug('stat %s:%s', k, v)
# TODOv3(jeblair): we may want to report stats by tenant (also?).
# Per-driver
self.assertReportedStat('zuul.event.gerrit.comment-added', value='1',
@ -164,23 +161,26 @@ class TestScheduler(ZuulTestCase):
exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace(
'.', '_')
self.assertReportedStat(exec_key + '.builds', value='1', kind='c')
self.assertReportedStat('zuul.nodepool.requested', value='1', kind='c')
self.assertReportedStat('zuul.nodepool.requested.label.label1',
value='1', kind='c')
self.assertReportedStat('zuul.nodepool.fulfilled.label.label1',
value='1', kind='c')
self.assertReportedStat('zuul.nodepool.requested.size.1', value='1',
kind='c')
self.assertReportedStat('zuul.nodepool.fulfilled.size.1', value='1',
kind='c')
self.assertReportedStat('zuul.nodepool.current_requests', value='1',
kind='g')
self.assertReportedStat('zuul.executors.online', value='1',
kind='g')
self.assertReportedStat('zuul.executors.accepting', value='1',
kind='g')
self.assertReportedStat('zuul.mergers.online', value='1',
kind='g')
self.assertReportedStat(
'zuul.nodepool.requests.requested.total', value='1', kind='c')
self.assertReportedStat(
'zuul.nodepool.requests.requested.label.label1',
value='1', kind='c')
self.assertReportedStat(
'zuul.nodepool.requests.fulfilled.label.label1',
value='1', kind='c')
self.assertReportedStat(
'zuul.nodepool.requests.requested.size.1', value='1', kind='c')
self.assertReportedStat(
'zuul.nodepool.requests.fulfilled.size.1', value='1', kind='c')
self.assertReportedStat(
'zuul.nodepool.current_requests', value='1', kind='g')
self.assertReportedStat(
'zuul.executors.online', value='1', kind='g')
self.assertReportedStat(
'zuul.executors.accepting', value='1', kind='g')
self.assertReportedStat(
'zuul.mergers.online', value='1', kind='g')
for build in self.history:
self.assertTrue(build.parameters['zuul']['voting'])


+ 22
- 15
zuul/nodepool.py View File

@ -24,33 +24,40 @@ class Nodepool(object):
self.sched = scheduler
def emitStats(self, request):
# Implements the following :
# counter zuul.nodepool.requests.<state>.total
# counter zuul.nodepool.requests.<state>.label.<label>
# counter zuul.nodepool.requests.<state>.size.<size>
# timer zuul.nodepool.requests.(fulfilled|failed)
# timer zuul.nodepool.requests.(fulfilled|failed).label.<label>
# timer zuul.nodepool.requests.(fulfilled|failed).size.<size>
# gauge zuul.nodepool.current_requests
if not self.sched.statsd:
return
statsd = self.sched.statsd
# counter zuul.nodepool.requested
# counter zuul.nodepool.requested.label.<label>
# counter zuul.nodepool.requested.size.<size>
# gauge zuul.nodepool.current_requests
pipe = statsd.pipeline()
state = request.state
dt = None
if request.canceled:
state = 'canceled'
dt = None
elif request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
dt = int((request.state_time - request.requested_time) * 1000)
else:
dt = None
key = 'zuul.nodepool.%s' % state
statsd.incr(key)
key = 'zuul.nodepool.requests.%s' % state
pipe.incr(key + ".total")
if dt:
statsd.timing(key, dt)
pipe.timing(key, dt)
for node in request.nodeset.getNodes():
statsd.incr(key + '.label.%s' % node.label)
pipe.incr(key + '.label.%s' % node.label)
if dt:
statsd.timing(key + '.label.%s' % node.label, dt)
statsd.incr(key + '.size.%s' % len(request.nodeset.nodes))
pipe.timing(key + '.label.%s' % node.label, dt)
pipe.incr(key + '.size.%s' % len(request.nodeset.nodes))
if dt:
statsd.timing(key + '.size.%s' % len(request.nodeset.nodes), dt)
statsd.gauge('zuul.nodepool.current_requests', len(self.requests))
pipe.timing(key + '.size.%s' % len(request.nodeset.nodes), dt)
pipe.gauge('zuul.nodepool.current_requests', len(self.requests))
pipe.send()
def requestNodes(self, build_set, job, relative_priority):
# Create a copy of the nodeset to represent the actual nodes


Loading…
Cancel
Save