Add tenant name on NodeRequests for Nodepool

This change adds the tenant name from the current event's context to
NodeRequests and exposes it as a new field in ZooKeeper.  It prepares
for a tenant-aware Nodepool Launcher that can enforce per-tenant
resource quotas.  In addition, Zuul exposes a new statsd metric,
``zuul.nodepool.tenant.<tenant>.current_requests``, which breaks the
overall current_requests metric down per tenant.

The corresponding spec can be found here:
https://review.opendev.org/c/zuul/zuul/+/788481

Change-Id: I6d47431e939aba2c80f30504b7a48c15f9fc8fb7
Benjamin Schanzel 2021-04-29 08:42:17 +02:00
parent fd028206de
commit 282a2ae391
8 changed files with 57 additions and 4 deletions


@@ -305,6 +305,14 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
       Persistently high values indicate more testing node resources
       would be helpful.
 
+   .. stat:: tenant.<tenant>.current_requests
+      :type: gauge
+
+      The number of outstanding nodepool requests from Zuul, broken
+      down by tenant.  If the tenant for a node request cannot be
+      determined, it is reported as ``unknown``.  This relates to
+      ``zuul.nodepool.current_requests``.
+
    .. stat:: resources
 
       Holds metrics about resource usage by tenant or project if resources
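
For illustration, a scheduler with two configured tenants, two open
requests in the first tenant, and one request whose tenant could not be
determined would emit gauges roughly like the following (tenant names
hypothetical; ``<name>:<value>|g`` is the plain statsd gauge wire format):

    zuul.nodepool.current_requests:3|g
    zuul.nodepool.tenant.tenant-one.current_requests:2|g
    zuul.nodepool.tenant.tenant-two.current_requests:0|g
    zuul.nodepool.tenant.unknown.current_requests:1|g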


@@ -0,0 +1,9 @@
+---
+features:
+  - |
+    Add the name of the tenant to node requests. This new field is passed
+    to ZooKeeper in preparation for a tenant-aware nodepool. In addition,
+    new statsd gauge metrics of currently open node requests are exported
+    as ``zuul.nodepool.tenant.<tenant>.current_requests``. This metric
+    tracks the open node requests per tenant and breaks down the overall
+    ``zuul.nodepool.current_requests`` metric.


@@ -35,12 +35,14 @@ class TestNodepoolIntegration(BaseTestCase):
         self.addCleanup(self.zk_client.disconnect)
         self.zk_client.connect()
         self.hostname = socket.gethostname()
 
+        # empty tenants dict
+        self.tenants = {}
         self.provisioned_requests = []
         # This class implements the scheduler methods zuul.nodepool
         # needs, so we pass 'self' as the scheduler.
         self.nodepool = zuul.nodepool.Nodepool(
-            self.zk_client, self.hostname, self.statsd, self)
+            self.zk_client, self.hostname, self.statsd, self.tenants, self)
 
     def waitForRequests(self):
         # Wait until all requests are complete.


@@ -42,12 +42,14 @@ class TestNodepool(BaseTestCase):
         self.addCleanup(self.zk_client.disconnect)
         self.zk_client.connect()
         self.hostname = 'nodepool-test-hostname'
 
+        # empty tenants dict
+        self.tenants = {}
         self.provisioned_requests = []
         # This class implements the scheduler methods zuul.nodepool
         # needs, so we pass 'self' as the scheduler.
         self.nodepool = zuul.nodepool.Nodepool(
-            self.zk_client, self.hostname, self.statsd, self)
+            self.zk_client, self.hostname, self.statsd, self.tenants, self)
 
         self.fake_nodepool = FakeNodepool(self.zk_chroot_fixture)
         self.addCleanup(self.fake_nodepool.stop)


@@ -432,6 +432,9 @@ class TestScheduler(ZuulTestCase):
             'zuul.nodepool.requests.fulfilled.size.1', value='1', kind='c')
         self.assertReportedStat(
             'zuul.nodepool.current_requests', value='1', kind='g')
+        self.assertReportedStat(
+            'zuul.nodepool.tenant.tenant-one.current_requests', value='1',
+            kind='g')
         self.assertReportedStat(
             'zuul.executors.online', value='1', kind='g')
         self.assertReportedStat(


@@ -801,6 +801,10 @@ class NodeRequest(object):
         self.relative_priority = relative_priority
         self.provider = self._getPausedParentProvider()
         self.id = None
+        if build_set is not None:
+            self.tenant = build_set.item.pipeline.tenant.name
+        else:
+            self.tenant = None
         self._zk_data = {}  # Data that we read back from ZK
         if event is not None:
             self.event_id = event.zuul_event_id
@@ -874,6 +878,7 @@ class NodeRequest(object):
         d['state_time'] = self.state_time
         d['relative_priority'] = self.relative_priority
         d['event_id'] = self.event_id
+        d['tenant'] = self.tenant
         return d
 
     def updateFromDict(self, data):
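
As a minimal sketch of the behavior added above (hypothetical, trimmed
stand-ins; the real NodeRequest carries many more fields): requests
created with a build set inherit the tenant name of its pipeline, while
requests created without one serialize ``tenant`` as None, which the
stats code below reports under ``unknown``.

    # Hypothetical, trimmed illustration of the tenant plumbing above.
    class MiniNodeRequest:
        def __init__(self, build_set=None):
            # With a build set, take the tenant from its pipeline;
            # otherwise the request has no tenant.
            if build_set is not None:
                self.tenant = build_set.item.pipeline.tenant.name
            else:
                self.tenant = None

        def toDict(self):
            # The tenant name rides along in the ZooKeeper node data.
            return {'tenant': self.tenant}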


@@ -33,9 +33,10 @@ def subtract_resources(target, source):
 class Nodepool(object):
     log = logging.getLogger('zuul.nodepool')
 
-    def __init__(self, zk_client, hostname, statsd, scheduler=None):
+    def __init__(self, zk_client, hostname, statsd, tenants, scheduler=None):
         self.hostname = hostname
         self.statsd = statsd
+        self.tenants = tenants
         # TODO (felix): Remove the scheduler parameter once the nodes are
         # locked on the executor side.
         self.sched = scheduler
@@ -55,6 +56,7 @@ class Nodepool(object):
         # timer  zuul.nodepool.requests.(fulfilled|failed).<label>
         # timer  zuul.nodepool.requests.(fulfilled|failed).<size>
         # gauge  zuul.nodepool.current_requests
+        # gauge  zuul.nodepool.tenant.<tenant>.current_requests
         if not self.statsd:
             return
         pipe = self.statsd.pipeline()
@@ -79,6 +81,27 @@ class Nodepool(object):
         if dt:
             pipe.timing(key + '.size.%s' % len(request.nodeset.nodes), dt)
         pipe.gauge('zuul.nodepool.current_requests', len(self.requests))
+
+        # Count the current requests of all tenants.  First get all
+        # currently configured tenants so tenants without open
+        # requests are reported as well.
+        tenant_requests = defaultdict(int)
+        for tenant_name in self.tenants.keys():
+            tenant_requests[tenant_name] = 0
+        for r in self.requests.values():
+            # The tenant might be None; such requests are reported
+            # separately as 'unknown'.
+            tenant_name = r.tenant if r.tenant else 'unknown'
+            tenant_requests[tenant_name] += 1
+
+        # Export current_requests stats per tenant.  The custom statsd
+        # client's format strings are not supported for pipelines, so
+        # call _format_stat manually here.
+        for tenant, request_count in tenant_requests.items():
+            stats_key = self.statsd._format_stat(
+                'zuul.nodepool.tenant.{tenant}.current_requests',
+                tenant=tenant)
+            pipe.gauge(stats_key, request_count)
+
         pipe.send()
 
     def emitStatsResources(self):
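
The counting logic above as a standalone sketch (function name
hypothetical).  Seeding the dict from the configured tenants is what
lets the gauge for an idle tenant drop back to 0 instead of going
stale; note that the hunk relies on ``defaultdict``, so the module
presumably imports it from ``collections`` outside the shown context:

    from collections import defaultdict

    def count_requests_by_tenant(tenants, requests):
        # Seed every configured tenant so idle tenants still report 0.
        counts = defaultdict(int)
        for tenant_name in tenants:
            counts[tenant_name] = 0
        # Requests without a tenant are grouped under 'unknown'.
        for request in requests:
            counts[request.tenant if request.tenant else 'unknown'] += 1
        return counts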


@@ -220,7 +220,8 @@ class Scheduler(threading.Thread):
         self.executor = ExecutorClient(self.config, self)
         self.merger = self._merger_client_class(self.config, self)
         self.nodepool = nodepool.Nodepool(
-            self.zk_client, self.hostname, self.statsd, self)
+            self.zk_client, self.hostname, self.statsd,
+            self.abide.tenants, self)
 
     def start(self):
         super(Scheduler, self).start()
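
Passing ``self.abide.tenants`` hands Nodepool a reference to the
scheduler's live tenant mapping, so the per-tenant gauges presumably
follow tenant additions and removals on reconfiguration without extra
plumbing; the unit tests above substitute a plain empty dict for the
same parameter.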