Report gross/total tenant resource usage stats

Export a new statsd gauge with the total resources of a tenant.
Currently, we only export resources of in-use nodes. With this, we
additionally report the cumulative resources of all of a tenant's nodes
(i.e. ready, deleting, ...).

This also renames the existing in-use resource stat so that the two can
be clearly distinguished.

Change-Id: I76a8c1212c7e9b476782403d52e4e22c030d1371
This commit is contained in:
Benjamin Schanzel 2022-02-16 09:23:49 +01:00
parent 7f3a0dcaab
commit eac322d252
9 changed files with 138 additions and 52 deletions

View File

@ -506,10 +506,14 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
reported as ``unknown``. This relates to
``zuul.nodepool.current_requests``.
.. stat:: resources
.. stat:: zuul.nodepool.resources
Holds metrics about resource usage by tenant or project if resources
of nodes are reported by nodepool.
Holds metrics about resource usage by tenant or project if resources
of nodes are reported by nodepool.
.. stat:: in_use
Holds metrics about resources currently in use by a build.
.. stat:: tenant
@ -519,7 +523,7 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
:type: counter, gauge
Counter with the summed usage by tenant as <resource> seconds and
gauge with the currently used resources by tenant.
gauge with the currently in use resources by tenant.
.. stat:: project
@ -531,6 +535,21 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
Counter with the summed usage by project as <resource> seconds and
gauge with the currently used resources by project.
.. stat:: total
Holds metrics about resources allocated in total. This includes
resources that are currently in use, allocated but not yet in use, and
scheduled to be deleted.
.. stat:: tenant
Holds resource usage metrics by tenant.
.. stat:: <tenant>.<resource>
:type: gauge
Gauge with the total resources currently allocated to the tenant.
.. stat:: zuul.mergers

View File

@ -0,0 +1,25 @@
---
features:
- |
Zuul now reports total resource usage statistics.
The following statistic is emitted:
`zuul.nodepool.resources.total.tenant.{tenant}.{resource}`
Gauge with the currently used resources by tenant in total, i.e., all nodes
belonging to a tenant, regardless of their state.
upgrade:
- |
Zuul now reports total and in-use resource usage statistics separately.
To distinguish those, the following statistics have been renamed:
- `zuul.nodepool.resources.tenant.{tenant}.{resource}`
`->`
`zuul.nodepool.resources.in_use.tenant.{tenant}.{resource}`
- `zuul.nodepool.resources.project.{project}.{resource}`
`->`
`zuul.nodepool.resources.in_use.project.{project}.{resource}`

View File

@ -3636,7 +3636,7 @@ class FakeNodepool(object):
nodes.append(data)
return nodes
def makeNode(self, request_id, node_type):
def makeNode(self, request_id, node_type, request):
now = time.time()
path = '/nodepool/nodes/'
remote_ip = os.environ.get('ZUUL_REMOTE_IPV4', '127.0.0.1')
@ -3695,6 +3695,8 @@ class FakeNodepool(object):
if 'fedora-pod' in node_type:
data['connection_type'] = 'kubectl'
data['connection_port']['pod'] = 'fedora-abcdefg'
data['tenant_name'] = request['tenant_name']
data['requestor'] = request['requestor']
data = json.dumps(data).encode('utf8')
path = self.client.create(path, data,
@ -3723,7 +3725,7 @@ class FakeNodepool(object):
request['state'] = 'fulfilled'
nodes = request.get('nodes', [])
for node in request['node_types']:
nodeid = self.makeNode(oid, node)
nodeid = self.makeNode(oid, node, request)
nodes.append(nodeid)
request['nodes'] = nodes

View File

@ -2009,8 +2009,9 @@ class TestScheduler(ZuulTestCase):
self.assertEqual([], request.nodes)
# Some convenience variables for checking the stats.
tenant_ram_stat = 'zuul.nodepool.resources.tenant.tenant-one.ram'
project_ram_stat = ('zuul.nodepool.resources.project.'
tenant_ram_stat =\
'zuul.nodepool.resources.in_use.tenant.tenant-one.ram'
project_ram_stat = ('zuul.nodepool.resources.in_use.project.'
'review_example_com/org/project.ram')
# Test that we zeroed the gauges
self.scheds.first.sched._runStats()
@ -5672,45 +5673,54 @@ For CI problems and help debugging, contact ci@example.org"""
self.assertHistory([
dict(name='project-merge', result='SUCCESS', changes='1,1'),
])
# All 3 nodes are in use
self.assertReportedStat(
'zuul.nodepool.resources.tenant.tenant-one.cores',
'zuul.nodepool.resources.total.tenant.tenant-one.cores',
value='6', kind='g')
self.assertReportedStat(
'zuul.nodepool.resources.tenant.tenant-one.ram',
'zuul.nodepool.resources.total.tenant.tenant-one.ram',
value='3072', kind='g')
self.assertReportedStat(
'zuul.nodepool.resources.tenant.tenant-one.instances',
'zuul.nodepool.resources.total.tenant.tenant-one.instances',
value='3', kind='g')
# All 3 nodes are in use
self.assertReportedStat(
'zuul.nodepool.resources.in_use.tenant.tenant-one.cores',
value='6', kind='g')
self.assertReportedStat(
'zuul.nodepool.resources.in_use.tenant.tenant-one.ram',
value='3072', kind='g')
self.assertReportedStat(
'zuul.nodepool.resources.in_use.tenant.tenant-one.instances',
value='3', kind='g')
self.assertReportedStat(
'zuul.nodepool.resources.project.review_example_com/org/project.'
'cores', value='6', kind='g')
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
'project.cores', value='6', kind='g')
self.assertReportedStat(
'zuul.nodepool.resources.project.review_example_com/org/project.'
'ram', value='3072', kind='g')
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
'project.ram', value='3072', kind='g')
self.assertReportedStat(
'zuul.nodepool.resources.project.review_example_com/org/project.'
'instances', value='3', kind='g')
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
'project.instances', value='3', kind='g')
# Check that resource usage counters are reported
self.assertReportedStat(
'zuul.nodepool.resources.tenant.tenant-one.cores',
'zuul.nodepool.resources.in_use.tenant.tenant-one.cores',
kind='c')
self.assertReportedStat(
'zuul.nodepool.resources.tenant.tenant-one.ram',
'zuul.nodepool.resources.in_use.tenant.tenant-one.ram',
kind='c')
self.assertReportedStat(
'zuul.nodepool.resources.tenant.tenant-one.instances',
'zuul.nodepool.resources.in_use.tenant.tenant-one.instances',
kind='c')
self.assertReportedStat(
'zuul.nodepool.resources.project.review_example_com/org/project.'
'cores', kind='c')
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
'project.cores', kind='c')
self.assertReportedStat(
'zuul.nodepool.resources.project.review_example_com/org/project.'
'ram', kind='c')
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
'project.ram', kind='c')
self.assertReportedStat(
'zuul.nodepool.resources.project.review_example_com/org/project.'
'instances', kind='c')
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
'project.instances', kind='c')
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()

View File

@ -387,7 +387,9 @@ class TestWeb(BaseTestWeb):
'label': 'label1',
'name': 'controller',
'aliases': [],
'requestor': None,
'state': 'unknown',
'tenant_name': None,
'user_data': None}],
},
'override_checkout': None,
@ -435,7 +437,9 @@ class TestWeb(BaseTestWeb):
'label': 'label2',
'name': 'controller',
'aliases': [],
'requestor': None,
'state': 'unknown',
'tenant_name': None,
'user_data': None}],
},
'override_checkout': None,
@ -1057,7 +1061,9 @@ class TestWeb(BaseTestWeb):
'id': None,
'label': 'label1',
'name': 'controller',
'requestor': None,
'state': 'unknown',
'tenant_name': None,
'user_data': None}]},
'override_branch': None,
'override_checkout': None,

View File

@ -1208,6 +1208,8 @@ class Node(ConfigObject):
self.resources = None
self.allocated_to = None
self.attributes = {}
self.tenant_name = None
self.requestor = None
@property
def state(self):
@ -1240,6 +1242,8 @@ class Node(ConfigObject):
d['hold_job'] = self.hold_job
d['comment'] = self.comment
d['user_data'] = self.user_data
d['tenant_name'] = self.tenant_name
d['requestor'] = self.requestor
for k in self._keys:
d[k] = getattr(self, k)
if internal_attributes:

View File

@ -167,11 +167,11 @@ class Nodepool(object):
return
for resource, value in resources.items():
key = 'zuul.nodepool.resources.tenant.{tenant}.{resource}'
key = 'zuul.nodepool.resources.in_use.tenant.{tenant}.{resource}'
self.statsd.incr(
key, value * duration, tenant=tenant, resource=resource)
for resource, value in resources.items():
key = 'zuul.nodepool.resources.project.' \
key = 'zuul.nodepool.resources.in_use.project.' \
'{project}.{resource}'
self.statsd.incr(
key, value * duration, project=project, resource=resource)
@ -315,7 +315,6 @@ class Nodepool(object):
log.info("Setting nodeset %s in use", nodeset)
user_data = dict(
zuul_system=self.system_id,
tenant_name=tenant_name,
project_name=project_name,
)
for node in nodeset.getNodes():
@ -483,16 +482,19 @@ class Nodepool(object):
total_requests = 0
tenant_requests = defaultdict(int)
resources_by_tenant = {}
resources_by_project = {}
in_use_resources_by_project = {}
in_use_resources_by_tenant = {}
total_resources_by_tenant = {}
empty_resource_dict = dict([(k, 0) for k in self.resource_types])
# Initialize zero values for gauges
for tenant in abide.tenants.values():
tenant_requests[tenant.name] = 0
resources_by_tenant[tenant.name] = empty_resource_dict.copy()
in_use_resources_by_tenant[tenant.name] =\
empty_resource_dict.copy()
total_resources_by_tenant[tenant.name] = empty_resource_dict.copy()
for project in tenant.all_projects:
resources_by_project[project.canonical_name] =\
in_use_resources_by_project[project.canonical_name] =\
empty_resource_dict.copy()
# Count node requests
@ -511,34 +513,52 @@ class Nodepool(object):
tenant=tenant)
# Count nodes
for node in self.getNodes():
for node in self.zk_nodepool.nodeIterator(cached=True):
if not node.resources:
continue
project_name = node.user_data.get('project_name')
tenant_name = node.user_data.get('tenant_name')
if not (project_name and tenant_name):
continue
tenant_name = node.tenant_name
if tenant_name in total_resources_by_tenant and \
node.requestor == self.system_id:
self.addResources(
total_resources_by_tenant[tenant_name],
node.resources)
# below here, we are only interested in nodes which are either
# in-use, used, or currently held by this zuul system
if node.state not in {model.STATE_IN_USE,
model.STATE_USED,
model.STATE_HOLD}:
continue
if tenant_name not in resources_by_tenant:
if not node.user_data:
continue
self.addResources(resources_by_tenant[tenant_name],
node.resources)
if project_name not in resources_by_project:
if node.user_data.get('zuul_system') != self.system_id:
continue
self.addResources(resources_by_project[project_name],
node.resources)
for tenant, resources in resources_by_tenant.items():
if tenant_name in in_use_resources_by_tenant:
self.addResources(
in_use_resources_by_tenant[tenant_name],
node.resources)
project_name = node.user_data.get('project_name')
if project_name in in_use_resources_by_project:
self.addResources(
in_use_resources_by_project[project_name],
node.resources)
for tenant, resources in total_resources_by_tenant.items():
for resource, value in resources.items():
key = 'zuul.nodepool.resources.tenant.' \
key = 'zuul.nodepool.resources.total.tenant.' \
'{tenant}.{resource}'
self.statsd.gauge(key, value, tenant=tenant, resource=resource)
for project, resources in resources_by_project.items():
for tenant, resources in in_use_resources_by_tenant.items():
for resource, value in resources.items():
key = 'zuul.nodepool.resources.project.' \
key = 'zuul.nodepool.resources.in_use.tenant.' \
'{tenant}.{resource}'
self.statsd.gauge(key, value, tenant=tenant, resource=resource)
for project, resources in in_use_resources_by_project.items():
for resource, value in resources.items():
key = 'zuul.nodepool.resources.in_use.project.' \
'{project}.{resource}'
self.statsd.gauge(
key, value, project=project, resource=resource)

View File

@ -1250,7 +1250,7 @@ class ZuulWebAPI(object):
isinstance(node.user_data, dict) and
node.user_data.get('zuul_system') ==
self.system.system_id and
node.user_data.get('tenant_name') == tenant):
node.tenant_name == tenant):
continue
node_data = {}
for key in ("id", "type", "connection_type", "external_id",

View File

@ -193,11 +193,11 @@ class ZooKeeperNodepool(ZooKeeperBase):
node.stat = stat
return node
def nodeIterator(self):
def nodeIterator(self, cached=False):
"""
Utility generator method for iterating through all nodes.
"""
for node_id in self.getNodes():
for node_id in self.getNodes(cached):
node = self.getNode(node_id)
if node:
yield node