Add tenant name on NodeRequests for Nodepool
This change adds the tenant name of the current events' context to NodeRequests and exposes it as a new field in ZooKeeper. It prepares for a tenant-aware Nodepool launcher that can enforce per-tenant resource quotas. In addition, Zuul exposes a new statsd metric ``zuul.nodepool.tenant.<tenant>.current_requests`` that breaks down the overall current_requests metric per tenant. The corresponding spec can be found here: https://review.opendev.org/c/zuul/zuul/+/788481 Change-Id: I6d47431e939aba2c80f30504b7a48c15f9fc8fb7
This commit is contained in:
parent
4f90d8ec1a
commit
e577ec90bd
@ -330,6 +330,14 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
|
||||
Persistently high values indicate more testing node resources
|
||||
would be helpful.
|
||||
|
||||
.. stat:: tenant.<tenant>.current_requests
|
||||
:type: gauge
|
||||
|
||||
The number of outstanding nodepool requests from Zuul drilled down by
|
||||
<tenant>. If a tenant for a node request cannot be determined, it is
|
||||
reported as ``unknown``. This relates to
|
||||
``zuul.nodepool.current_requests``.
|
||||
|
||||
.. stat:: resources
|
||||
|
||||
Holds metrics about resource usage by tenant or project if resources
|
||||
|
@ -0,0 +1,7 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Add new statsd gauge metrics of current open node requests exported as
|
||||
``zuul.nodepool.tenant.<tenant>.current_requests``. This metric tracks the
|
||||
currently open node requests per tenant. It drills down the overall
|
||||
``zuul.nodepool.current_requests`` metric.
|
@ -493,8 +493,12 @@ class TestScheduler(ZuulTestCase):
|
||||
'zuul.nodepool.requests.requested.size.1', value='1', kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.requests.fulfilled.size.1', value='1', kind='c')
|
||||
# just check for existence, since we can not know if a request is
|
||||
# in-flight during the sched._stats_interval
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.current_requests', value='1', kind='g')
|
||||
'zuul.nodepool.current_requests', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.tenant.tenant-one.current_requests', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.executors.online', value='1', kind='g')
|
||||
self.assertReportedStat(
|
||||
|
@ -879,6 +879,7 @@ class NodeRequest(object):
|
||||
d['state_time'] = self.state_time
|
||||
d['relative_priority'] = self.relative_priority
|
||||
d['event_id'] = self.event_id
|
||||
d['tenant_name'] = self.tenant_name
|
||||
return d
|
||||
|
||||
def updateFromDict(self, data):
|
||||
@ -887,6 +888,11 @@ class NodeRequest(object):
|
||||
self.state_time = data['state_time']
|
||||
self.relative_priority = data.get('relative_priority', 0)
|
||||
self.event_id = data['event_id']
|
||||
# Make sure we don't update tenant_name to 'None'.
|
||||
# This can happen if nodepool does not report one back and leads
|
||||
# to errors at other places where we rely on that info.
|
||||
if 'tenant_name' in data:
|
||||
self.tenant_name = data['tenant_name']
|
||||
|
||||
@classmethod
|
||||
def fromDict(cls, data):
|
||||
|
@ -54,7 +54,6 @@ class Nodepool(object):
|
||||
# timer zuul.nodepool.requests.(fulfilled|failed)
|
||||
# timer zuul.nodepool.requests.(fulfilled|failed).<label>
|
||||
# timer zuul.nodepool.requests.(fulfilled|failed).<size>
|
||||
# gauge zuul.nodepool.current_requests
|
||||
if not self.statsd:
|
||||
return
|
||||
pipe = self.statsd.pipeline()
|
||||
@ -78,7 +77,6 @@ class Nodepool(object):
|
||||
pipe.incr(key + '.size.%s' % len(request.nodeset.nodes))
|
||||
if dt:
|
||||
pipe.timing(key + '.size.%s' % len(request.nodeset.nodes), dt)
|
||||
pipe.gauge('zuul.nodepool.current_requests', len(self.requests))
|
||||
pipe.send()
|
||||
|
||||
def emitStatsResources(self):
|
||||
|
@ -448,6 +448,29 @@ class Scheduler(threading.Thread):
|
||||
self.statsd.gauge(f"{base}.management_events",
|
||||
len(management_event_queues[pipeline.name]))
|
||||
|
||||
self.statsd.gauge('zuul.nodepool.current_requests',
|
||||
len(self.nodepool.requests))
|
||||
|
||||
# export current_requests stats per tenant
|
||||
tenant_requests = defaultdict(int)
|
||||
|
||||
# need to initialize tenants here explicitly to report a zero for
|
||||
# all tenants for which no requests are in-flight
|
||||
for tenant_name in self.abide.tenants.keys():
|
||||
tenant_requests[tenant_name] = 0
|
||||
|
||||
for r in self.nodepool.requests.values():
|
||||
# might be None, we ignore them for this metric
|
||||
if not r.tenant_name:
|
||||
continue
|
||||
tenant_requests[r.tenant_name] += 1
|
||||
|
||||
for tenant, request_count in tenant_requests.items():
|
||||
self.statsd.gauge(
|
||||
"zuul.nodepool.tenant.{tenant}.current_requests",
|
||||
request_count,
|
||||
tenant=tenant)
|
||||
|
||||
def startCleanup(self):
|
||||
# Run the first cleanup immediately after the first
|
||||
# reconfiguration.
|
||||
|
Loading…
x
Reference in New Issue
Block a user