Browse Source

Merge "Emit some nodepool stats" into feature/zuulv3

changes/84/512084/1
Zuul 5 years ago committed by Gerrit Code Review
parent
commit
4ab54b2d59
  1. 78
      doc/source/admin/monitoring.rst
  2. 1
      tests/unit/test_nodepool.py
  3. 8
      tests/unit/test_scheduler.py
  4. 1
      zuul/model.py
  5. 33
      zuul/nodepool.py

78
doc/source/admin/monitoring.rst

@ -146,6 +146,84 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
The one-minute load average of this executor, multiplied by 100.
.. stat:: zuul.nodepool
Holds metrics related to Zuul requests from Nodepool.
.. stat:: requested
:type: counter
Incremented each time a node request is submitted to Nodepool.
.. stat:: label.<label>
:type: counter
Incremented each time a request for a specific label is
submitted to Nodepool.
.. stat:: size.<size>
:type: counter
Incremented each time a request of a specific size is submitted
to Nodepool. For example, a request for 3 nodes would use the
key ``zuul.nodepool.requested.size.3``.
.. stat:: canceled
:type: counter, timer
The counter is incremented each time a node request is canceled
by Zuul. The timer records the elapsed time from request to
cancelation.
.. stat:: label.<label>
:type: counter, timer
The same, for a specific label.
.. stat:: size.<size>
:type: counter, timer
The same, for a specific request size.
.. stat:: fulfilled
:type: counter, timer
The counter is incremented each time a node request is fulfilled
by Nodepool. The timer records the elapsed time from request to
fulfillment.
.. stat:: label.<label>
:type: counter, timer
The same, for a specific label.
.. stat:: size.<size>
:type: counter, timer
The same, for a specific request size.
.. stat:: failed
:type: counter, timer
The counter is incremented each time Nodepool fails to fulfill a
node request. The timer records the elapsed time from request
to failure.
.. stat:: label.<label>
:type: counter, timer
The same, for a specific label.
.. stat:: size.<size>
:type: counter, timer
The same, for a specific request size.
.. stat:: current_requests
:type: gauge
The number of outstanding nodepool requests from Zuul.
As an example, given a job named `myjob` in `mytenant` triggered by a
change to `myproject` on the `master` branch in the `gate` pipeline

1
tests/unit/test_nodepool.py

@ -29,6 +29,7 @@ class TestNodepool(BaseTestCase):
def setUp(self):
super(TestNodepool, self).setUp()
self.statsd = None
self.zk_chroot_fixture = self.useFixture(
ChrootedKazooFixture(self.id()))
self.zk_config = '%s:%s%s' % (

8
tests/unit/test_scheduler.py

@ -119,6 +119,14 @@ class TestScheduler(ZuulTestCase):
'org_project.master.total_changes', value='1|c')
exec_key = 'zuul.executor.%s' % self.executor_server.hostname
self.assertReportedStat(exec_key + '.builds', value='1|c')
self.assertReportedStat('zuul.nodepool.requested', value='1|c')
self.assertReportedStat('zuul.nodepool.requested.label.label1',
value='1|c')
self.assertReportedStat('zuul.nodepool.fulfilled.label.label1',
value='1|c')
self.assertReportedStat('zuul.nodepool.requested.size.1', value='1|c')
self.assertReportedStat('zuul.nodepool.fulfilled.size.1', value='1|c')
self.assertReportedStat('zuul.nodepool.current_requests', value='1|g')
for build in self.history:
self.assertTrue(build.parameters['zuul']['voting'])

1
zuul/model.py

@ -523,6 +523,7 @@ class NodeRequest(object):
self.job = job
self.nodeset = nodeset
self._state = STATE_REQUESTED
self.requested_time = time.time()
self.state_time = time.time()
self.stat = None
self.uid = uuid4().hex

33
zuul/nodepool.py

@ -22,6 +22,35 @@ class Nodepool(object):
self.requests = {}
self.sched = scheduler
def emitStats(self, request):
if not self.sched.statsd:
return
statsd = self.sched.statsd
# counter zuul.nodepool.requested
# counter zuul.nodepool.requested.label.<label>
# counter zuul.nodepool.requested.size.<size>
# gauge zuul.nodepool.current_requests
state = request.state
if request.canceled:
state = 'canceled'
dt = None
elif request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
dt = int((request.state_time - request.requested_time) * 1000)
else:
dt = None
key = 'zuul.nodepool.%s' % state
statsd.incr(key)
if dt:
statsd.timing(key, dt)
for node in request.nodeset.getNodes():
statsd.incr(key + '.label.%s' % node.label)
if dt:
statsd.timing(key + '.label.%s' % node.label, dt)
statsd.incr(key + '.size.%s' % len(request.nodeset.nodes))
if dt:
statsd.timing(key + '.size.%s' % len(request.nodeset.nodes), dt)
statsd.gauge('zuul.nodepool.current_requests', len(self.requests))
def requestNodes(self, build_set, job):
# Create a copy of the nodeset to represent the actual nodes
# returned by nodepool.
@ -33,6 +62,7 @@ class Nodepool(object):
self.sched.zk.submitNodeRequest(req, self._updateNodeRequest)
# Logged after submission so that we have the request id
self.log.info("Submited node request %s" % (req,))
self.emitStats(req)
else:
self.log.info("Fulfilling empty node request %s" % (req,))
req.state = model.STATE_FULFILLED
@ -142,6 +172,7 @@ class Nodepool(object):
if request.canceled:
del self.requests[request.uid]
self.emitStats(request)
return False
# TODOv3(jeblair): handle allocation failure
@ -156,6 +187,8 @@ class Nodepool(object):
self.sched.onNodesProvisioned(request)
del self.requests[request.uid]
self.emitStats(request)
# Stop watching this request node.
return False

Loading…
Cancel
Save