Add tenant name on NodeRequests for Nodepool

This change adds the tenant name of the current event's context to
NodeRequests and exposes it as a new field in ZooKeeper.  This prepares
for a tenant-aware Nodepool launcher that can enforce per-tenant
resource quotas.  In addition, Zuul exposes a new statsd metric
``zuul.nodepool.tenant.<tenant>.current_requests`` that breaks down the
overall current_requests metric by tenant.

The corresponding spec can be found here:
https://review.opendev.org/c/zuul/zuul/+/788481

Change-Id: I6d47431e939aba2c80f30504b7a48c15f9fc8fb7
This commit is contained in:
Benjamin Schanzel 2021-04-29 08:42:17 +02:00 committed by James E. Blair
parent 4f90d8ec1a
commit e577ec90bd
6 changed files with 49 additions and 3 deletions

View File

@ -330,6 +330,14 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
Persistently high values indicate more testing node resources
would be helpful.
.. stat:: tenant.<tenant>.current_requests
:type: gauge
The number of outstanding nodepool requests from Zuul, broken down by
<tenant>. Node requests for which no tenant can be determined are not
counted in this metric. This relates to
``zuul.nodepool.current_requests``.
.. stat:: resources
Holds metrics about resource usage by tenant or project if resources

View File

@ -0,0 +1,7 @@
---
features:
- |
Add a new statsd gauge metric for currently open node requests, exported as
``zuul.nodepool.tenant.<tenant>.current_requests``. This metric tracks the
currently open node requests per tenant. It breaks down the overall
``zuul.nodepool.current_requests`` metric by tenant.

View File

@ -493,8 +493,12 @@ class TestScheduler(ZuulTestCase):
'zuul.nodepool.requests.requested.size.1', value='1', kind='c')
self.assertReportedStat(
'zuul.nodepool.requests.fulfilled.size.1', value='1', kind='c')
# just check for existence, since we cannot know if a request is
# in-flight during the sched._stats_interval
self.assertReportedStat(
'zuul.nodepool.current_requests', value='1', kind='g')
'zuul.nodepool.current_requests', kind='g')
self.assertReportedStat(
'zuul.nodepool.tenant.tenant-one.current_requests', kind='g')
self.assertReportedStat(
'zuul.executors.online', value='1', kind='g')
self.assertReportedStat(

View File

@ -879,6 +879,7 @@ class NodeRequest(object):
d['state_time'] = self.state_time
d['relative_priority'] = self.relative_priority
d['event_id'] = self.event_id
d['tenant_name'] = self.tenant_name
return d
def updateFromDict(self, data):
@ -887,6 +888,11 @@ class NodeRequest(object):
self.state_time = data['state_time']
self.relative_priority = data.get('relative_priority', 0)
self.event_id = data['event_id']
# Make sure we don't update tenant_name to 'None'.
# This can happen if nodepool does not report one back and leads
# to errors at other places where we rely on that info.
if 'tenant_name' in data:
self.tenant_name = data['tenant_name']
@classmethod
def fromDict(cls, data):

View File

@ -54,7 +54,6 @@ class Nodepool(object):
# timer zuul.nodepool.requests.(fulfilled|failed)
# timer zuul.nodepool.requests.(fulfilled|failed).<label>
# timer zuul.nodepool.requests.(fulfilled|failed).<size>
# gauge zuul.nodepool.current_requests
if not self.statsd:
return
pipe = self.statsd.pipeline()
@ -78,7 +77,6 @@ class Nodepool(object):
pipe.incr(key + '.size.%s' % len(request.nodeset.nodes))
if dt:
pipe.timing(key + '.size.%s' % len(request.nodeset.nodes), dt)
pipe.gauge('zuul.nodepool.current_requests', len(self.requests))
pipe.send()
def emitStatsResources(self):

View File

@ -448,6 +448,29 @@ class Scheduler(threading.Thread):
self.statsd.gauge(f"{base}.management_events",
len(management_event_queues[pipeline.name]))
self.statsd.gauge('zuul.nodepool.current_requests',
len(self.nodepool.requests))
# export current_requests stats per tenant
tenant_requests = defaultdict(int)
# need to initialize tenants here explicitly to report a zero for
# all tenants for which no requests are in-flight
for tenant_name in self.abide.tenants.keys():
tenant_requests[tenant_name] = 0
for r in self.nodepool.requests.values():
# might be None, we ignore them for this metric
if not r.tenant_name:
continue
tenant_requests[r.tenant_name] += 1
for tenant, request_count in tenant_requests.items():
self.statsd.gauge(
"zuul.nodepool.tenant.{tenant}.current_requests",
request_count,
tenant=tenant)
def startCleanup(self):
# Run the first cleanup immediately after the first
# reconfiguration.