Merge "Rework zuul nodepool stats reporting"

This commit is contained in:
Zuul 2018-12-10 04:42:44 +00:00 committed by Gerrit Code Review
commit a6ba568d72
5 changed files with 106 additions and 95 deletions

View File

@ -188,78 +188,69 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
The used RAM (excluding buffers and cache) on this executor, as The used RAM (excluding buffers and cache) on this executor, as
a percentage multiplied by 100. a percentage multiplied by 100.
.. stat:: zuul.nodepool .. stat:: zuul.nodepool.requests
Holds metrics related to Zuul requests from Nodepool. Holds metrics related to Zuul requests and responses from Nodepool.
.. stat:: requested States are one of:
*requested*
Node request submitted by Zuul to Nodepool
*canceled*
Node request was canceled by Zuul
*failed*
Nodepool failed to fulfill a node request
*fulfilled*
Nodes were assigned by Nodepool
.. stat:: <state>
:type: timer
Records the elapsed time from request to completion for states
`failed` and `fulfilled`. For example,
``zuul.nodepool.requests.fulfilled.mean`` will give the average
time for all fulfilled requests within each ``statsd`` flush
interval.
A lower value for `fulfilled` requests is better. Ideally,
there will be no `failed` requests.
.. stat:: <state>.total
:type: counter :type: counter
Incremented each time a node request is submitted to Nodepool. Incremented each time a node request enters one of the states
described above.
.. stat:: label.<label> .. stat:: <state>.size.<size>
:type: counter
Incremented each time a request for a specific label is
submitted to Nodepool.
.. stat:: size.<size>
:type: counter
Incremented each time a request of a specific size is submitted
to Nodepool. For example, a request for 3 nodes would use the
key ``zuul.nodepool.requested.size.3``.
.. stat:: canceled
:type: counter, timer :type: counter, timer
The counter is incremented each time a node request is canceled Increments for the node count of each request. For example, a
by Zuul. The timer records the elapsed time from request to request for 3 nodes would use the key
cancelation. ``zuul.nodepool.requests.requested.size.3``; fulfillment of 3
node requests can be tracked with
``zuul.nodepool.requests.fulfilled.size.3``.
.. stat:: label.<label> The timer is implemented for ``fulfilled`` and ``failed``
:type: counter, timer requests. For example, the timer
``zuul.nodepool.requests.failed.size.3.mean`` gives the average
time of 3-node failed requests within the ``statsd`` flush
interval. A lower value for `fulfilled` requests is better.
Ideally, there will be no `failed` requests.
The same, for a specific label. .. stat:: <state>.label.<label>
.. stat:: size.<size>
:type: counter, timer
The same, for a specific request size.
.. stat:: fulfilled
:type: counter, timer :type: counter, timer
The counter is incremented each time a node request is fulfilled Increments for the label of each request. For example, requests
by Nodepool. The timer records the elapsed time from request to for `centos7` nodes could be tracked with
fulfillment. ``zuul.nodepool.requests.requested.label.centos7``.
.. stat:: label.<label> The timer is implemented for ``fulfilled`` and ``failed``
:type: counter, timer requests. For example, the timer
``zuul.nodepool.requests.fulfilled.label.centos7.mean`` gives
The same, for a specific label. the average time of ``centos7`` fulfilled requests within the
``statsd`` flush interval. A lower value for `fulfilled`
.. stat:: size.<size> requests is better. Ideally, there will be no `failed`
:type: counter, timer requests.
The same, for a specific request size.
.. stat:: failed
:type: counter, timer
The counter is incremented each time Nodepool fails to fulfill a
node request. The timer records the elapsed time from request
to failure.
.. stat:: label.<label>
:type: counter, timer
The same, for a specific label.
.. stat:: size.<size>
:type: counter, timer
The same, for a specific request size.
.. stat:: current_requests .. stat:: current_requests
:type: gauge :type: gauge

View File

@ -0,0 +1,10 @@
---
upgrade:
- |
The `zuul.nodepool` statistics have been moved under
`zuul.nodepool.requests` to allow sub-stats to work correctly.
For example `zuul.nodepool.requested` has become
`zuul.nodepool.requests.requested.total`. The previously missing
`label` and `size` counters are now available at
`zuul.nodepool.requests.<state>.label.<label>` and
`zuul.nodepool.requests.<state>.size.<size>`. For more info see
the monitoring documentation.

View File

@ -1294,6 +1294,8 @@ class BuildHistory(object):
class FakeStatsd(threading.Thread): class FakeStatsd(threading.Thread):
log = logging.getLogger("zuul.test.FakeStatsd")
def __init__(self): def __init__(self):
threading.Thread.__init__(self) threading.Thread.__init__(self)
self.daemon = True self.daemon = True
@ -1314,6 +1316,7 @@ class FakeStatsd(threading.Thread):
data = self.sock.recvfrom(1024) data = self.sock.recvfrom(1024)
if not data: if not data:
return return
self.log.debug("Appending: %s" % data[0])
self.stats.append(data[0]) self.stats.append(data[0])
if fd == self.wake_read: if fd == self.wake_read:
return return

View File

@ -130,9 +130,6 @@ class TestScheduler(ZuulTestCase):
self.assertEqual(self.getJobFromHistory('project-test2').node, self.assertEqual(self.getJobFromHistory('project-test2').node,
'label1') 'label1')
for stat in self.statsd.stats:
k, v = stat.decode('utf-8').split(':')
self.log.debug('stat %s:%s', k, v)
# TODOv3(jeblair): we may want to report stats by tenant (also?). # TODOv3(jeblair): we may want to report stats by tenant (also?).
# Per-driver # Per-driver
self.assertReportedStat('zuul.event.gerrit.comment-added', value='1', self.assertReportedStat('zuul.event.gerrit.comment-added', value='1',
@ -164,23 +161,26 @@ class TestScheduler(ZuulTestCase):
exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace( exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace(
'.', '_') '.', '_')
self.assertReportedStat(exec_key + '.builds', value='1', kind='c') self.assertReportedStat(exec_key + '.builds', value='1', kind='c')
self.assertReportedStat('zuul.nodepool.requested', value='1', kind='c') self.assertReportedStat(
self.assertReportedStat('zuul.nodepool.requested.label.label1', 'zuul.nodepool.requests.requested.total', value='1', kind='c')
value='1', kind='c') self.assertReportedStat(
self.assertReportedStat('zuul.nodepool.fulfilled.label.label1', 'zuul.nodepool.requests.requested.label.label1',
value='1', kind='c') value='1', kind='c')
self.assertReportedStat('zuul.nodepool.requested.size.1', value='1', self.assertReportedStat(
kind='c') 'zuul.nodepool.requests.fulfilled.label.label1',
self.assertReportedStat('zuul.nodepool.fulfilled.size.1', value='1', value='1', kind='c')
kind='c') self.assertReportedStat(
self.assertReportedStat('zuul.nodepool.current_requests', value='1', 'zuul.nodepool.requests.requested.size.1', value='1', kind='c')
kind='g') self.assertReportedStat(
self.assertReportedStat('zuul.executors.online', value='1', 'zuul.nodepool.requests.fulfilled.size.1', value='1', kind='c')
kind='g') self.assertReportedStat(
self.assertReportedStat('zuul.executors.accepting', value='1', 'zuul.nodepool.current_requests', value='1', kind='g')
kind='g') self.assertReportedStat(
self.assertReportedStat('zuul.mergers.online', value='1', 'zuul.executors.online', value='1', kind='g')
kind='g') self.assertReportedStat(
'zuul.executors.accepting', value='1', kind='g')
self.assertReportedStat(
'zuul.mergers.online', value='1', kind='g')
for build in self.history: for build in self.history:
self.assertTrue(build.parameters['zuul']['voting']) self.assertTrue(build.parameters['zuul']['voting'])

View File

@ -24,33 +24,40 @@ class Nodepool(object):
self.sched = scheduler self.sched = scheduler
def emitStats(self, request): def emitStats(self, request):
# Implements the following :
# counter zuul.nodepool.requests.<state>.total
# counter zuul.nodepool.requests.<state>.label.<label>
# counter zuul.nodepool.requests.<state>.size.<size>
# timer zuul.nodepool.requests.(fulfilled|failed)
# timer zuul.nodepool.requests.(fulfilled|failed).label.<label>
# timer zuul.nodepool.requests.(fulfilled|failed).size.<size>
# gauge zuul.nodepool.current_requests
if not self.sched.statsd: if not self.sched.statsd:
return return
statsd = self.sched.statsd statsd = self.sched.statsd
# counter zuul.nodepool.requested pipe = statsd.pipeline()
# counter zuul.nodepool.requested.label.<label>
# counter zuul.nodepool.requested.size.<size>
# gauge zuul.nodepool.current_requests
state = request.state state = request.state
dt = None
if request.canceled: if request.canceled:
state = 'canceled' state = 'canceled'
dt = None
elif request.state in (model.STATE_FULFILLED, model.STATE_FAILED): elif request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
dt = int((request.state_time - request.requested_time) * 1000) dt = int((request.state_time - request.requested_time) * 1000)
else:
dt = None key = 'zuul.nodepool.requests.%s' % state
key = 'zuul.nodepool.%s' % state pipe.incr(key + ".total")
statsd.incr(key)
if dt: if dt:
statsd.timing(key, dt) pipe.timing(key, dt)
for node in request.nodeset.getNodes(): for node in request.nodeset.getNodes():
statsd.incr(key + '.label.%s' % node.label) pipe.incr(key + '.label.%s' % node.label)
if dt: if dt:
statsd.timing(key + '.label.%s' % node.label, dt) pipe.timing(key + '.label.%s' % node.label, dt)
statsd.incr(key + '.size.%s' % len(request.nodeset.nodes)) pipe.incr(key + '.size.%s' % len(request.nodeset.nodes))
if dt: if dt:
statsd.timing(key + '.size.%s' % len(request.nodeset.nodes), dt) pipe.timing(key + '.size.%s' % len(request.nodeset.nodes), dt)
statsd.gauge('zuul.nodepool.current_requests', len(self.requests)) pipe.gauge('zuul.nodepool.current_requests', len(self.requests))
pipe.send()
def requestNodes(self, build_set, job, relative_priority): def requestNodes(self, build_set, job, relative_priority):
# Create a copy of the nodeset to represent the actual nodes # Create a copy of the nodeset to represent the actual nodes