Report tenant- and project-specific resource usage stats

We currently lack a means of resource accounting for tenants and
projects. Together with an addition to nodepool that attaches resource
metadata to nodes, we can emit statsd statistics per tenant and per
project.

The following statistics are emitted:

* zuul.nodepool.resources.tenant.{tenant}.{resource}
  Gauge with the resources currently in use by the tenant and counter
  with the summed usage by tenant, e.g. CPU seconds

* zuul.nodepool.resources.project.{project}.{resource}
  Gauge with the resources currently in use by the project and counter
  with the summed usage by project, e.g. CPU seconds
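
For example (hypothetical numbers): a build holding a node with 2 cores
for 300 seconds reports

* zuul.nodepool.resources.tenant.tenant-one.cores
  gauge = 2 while the node is in use
  counter += 2 * 300 = 600 core seconds once the node is returned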

Depends-On: https://review.openstack.org/616262
Change-Id: I68ea68128287bf52d107959e1c343dfce98f1fc8
Tobias Henkel 2018-11-07 22:01:14 +01:00
parent af910d9d8e
commit e90fe41bfe
7 changed files with 202 additions and 4 deletions


@@ -288,6 +288,32 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
      Persistently high values indicate more testing node resources
      would be helpful.

   .. stat:: resources

      Holds metrics about resource usage by tenant or project if nodepool
      reports resources for nodes.

      .. stat:: tenant

         Holds resource usage metrics by tenant.

         .. stat:: <tenant>.<resource>
            :type: counter, gauge

            Counter with the summed usage by tenant as <resource> seconds
            and gauge with the resources currently in use by the tenant.

      .. stat:: project

         Holds resource usage metrics by project.

         .. stat:: <project>.<resource>
            :type: counter, gauge

            Counter with the summed usage by project as <resource> seconds
            and gauge with the resources currently in use by the project.

.. stat:: zuul.mergers

   Holds metrics related to Zuul mergers.


@@ -0,0 +1,14 @@
---
features:
  - |
    Zuul now reports resource usage statistics if they are provided by
    nodepool. The following statistics are emitted:

    - `zuul.nodepool.resources.tenant.{tenant}.{resource}`:
      Gauge with the resources currently in use by the tenant and counter
      with the summed usage by tenant, e.g. CPU seconds.

    - `zuul.nodepool.resources.project.{project}.{resource}`:
      Gauge with the resources currently in use by the project and counter
      with the summed usage by project, e.g. CPU seconds.


@@ -1846,6 +1846,7 @@ class FakeNodepool(object):
        self.fail_requests = set()
        self.remote_ansible = False
        self.attributes = None
        self.resources = None

    def stop(self):
        self._running = False

@@ -1951,6 +1952,8 @@ class FakeNodepool(object):
            host_keys=host_keys,
            executor='fake-nodepool',
            hold_expiration=None)
        if self.resources:
            data['resources'] = self.resources
        if self.remote_ansible:
            data['connection_type'] = 'ssh'
        if 'fakeuser' in node_type:


@@ -5317,6 +5317,72 @@ For CI problems and help debugging, contact ci@example.org"""
        self.assertIn('project-test1 : SKIPPED', A.messages[1])
        self.assertIn('project-test2 : SKIPPED', A.messages[1])

    def test_nodepool_resources(self):
        "Test that resources are reported"

        self.executor_server.hold_jobs_in_build = True
        self.fake_nodepool.resources = {
            'cores': 2,
            'ram': 1024,
            'instances': 1,
        }
        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        A.addApproval('Code-Review', 2)
        self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
        self.waitUntilSettled()

        self.executor_server.release('project-merge')
        self.waitUntilSettled()

        # Check that resource usage gauges are reported
        self.assertHistory([
            dict(name='project-merge', result='SUCCESS', changes='1,1'),
        ])
        self.assertReportedStat(
            'zuul.nodepool.resources.tenant.tenant-one.cores',
            value='2', kind='g')
        self.assertReportedStat(
            'zuul.nodepool.resources.tenant.tenant-one.ram',
            value='1024', kind='g')
        self.assertReportedStat(
            'zuul.nodepool.resources.tenant.tenant-one.instances',
            value='1', kind='g')
        self.assertReportedStat(
            'zuul.nodepool.resources.project.review_example_com/org/project.'
            'cores', value='2', kind='g')
        self.assertReportedStat(
            'zuul.nodepool.resources.project.review_example_com/org/project.'
            'ram', value='1024', kind='g')
        self.assertReportedStat(
            'zuul.nodepool.resources.project.review_example_com/org/project.'
            'instances', value='1', kind='g')

        # Check that resource usage counters are reported
        self.assertReportedStat(
            'zuul.nodepool.resources.tenant.tenant-one.cores',
            kind='c')
        self.assertReportedStat(
            'zuul.nodepool.resources.tenant.tenant-one.ram',
            kind='c')
        self.assertReportedStat(
            'zuul.nodepool.resources.tenant.tenant-one.instances',
            kind='c')
        self.assertReportedStat(
            'zuul.nodepool.resources.project.review_example_com/org/project.'
            'cores', kind='c')
        self.assertReportedStat(
            'zuul.nodepool.resources.project.review_example_com/org/project.'
            'ram', kind='c')
        self.assertReportedStat(
            'zuul.nodepool.resources.project.review_example_com/org/project.'
            'instances', kind='c')

        self.executor_server.hold_jobs_in_build = False
        self.executor_server.release()
        self.waitUntilSettled()

        self.assertEqual(A.data['status'], 'MERGED')
        self.assertEqual(A.reported, 2)

    def test_nodepool_pipeline_priority(self):
        "Test that nodes are requested at the correct pipeline priority"
@@ -6520,7 +6586,7 @@ class TestSemaphore(ZuulTestCase):
        # Simulate a single zk error in useNodeSet
        orig_useNodeSet = self.nodepool.useNodeSet

-        def broken_use_nodeset(nodeset):
+        def broken_use_nodeset(nodeset, build_set=None):
            # restore original useNodeSet
            self.nodepool.useNodeSet = orig_useNodeSet
            raise NoNodeError()


@@ -403,7 +403,8 @@ class PipelineManager(object):
            self.log.debug("Found job %s for change %s" % (job, item.change))
            try:
                nodeset = item.current_build_set.getJobNodeSet(job.name)
-                self.sched.nodepool.useNodeSet(nodeset)
+                self.sched.nodepool.useNodeSet(
+                    nodeset, build_set=item.current_build_set)
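                # Passing the build set lets useNodeSet attribute the
                # nodes' resources to this item's tenant and project (see
                # the Nodepool.useNodeSet change below).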
                self.sched.executor.execute(
                    job, item, self.pipeline,
                    build_set.dependent_changes,


@@ -547,6 +547,7 @@ class Node(ConfigObject):
        self.region = None
        self.username = None
        self.hold_expiration = None
        self.resources = None

    @property
    def state(self):


@@ -12,17 +12,30 @@
import logging
from collections import defaultdict

from zuul import model
from zuul.lib.logutil import get_annotated_logger
from zuul.zk import LockException


def add_resources(target, source):
    for key, value in source.items():
        target[key] += value


def subtract_resources(target, source):
    for key, value in source.items():
        target[key] -= value
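
# A minimal usage sketch (hypothetical values, not part of the change):
# both helpers mutate ``target`` in place and assume missing keys read as
# zero, which is why callers below pass a collections.defaultdict(int):
#
#   usage = defaultdict(int)
#   add_resources(usage, {'cores': 2, 'ram': 1024})
#   add_resources(usage, {'cores': 4, 'instances': 1})
#   assert usage == {'cores': 6, 'ram': 1024, 'instances': 1}
#   subtract_resources(usage, {'cores': 2, 'ram': 1024})
#   assert usage == {'cores': 4, 'ram': 0, 'instances': 1}
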
class Nodepool(object):
    log = logging.getLogger('zuul.nodepool')

    def __init__(self, scheduler):
        self.requests = {}
        self.sched = scheduler

        self.current_resources_by_tenant = {}
        self.current_resources_by_project = {}

    def emitStats(self, request):
        # Implements the following :
@@ -60,6 +73,37 @@ class Nodepool(object):
            pipe.gauge('zuul.nodepool.current_requests', len(self.requests))
            pipe.send()

    def emitStatsResources(self):
        if not self.sched.statsd:
            return
        statsd = self.sched.statsd

        for tenant, resources in self.current_resources_by_tenant.items():
            for resource, value in resources.items():
                key = 'zuul.nodepool.resources.tenant.' \
                      '{tenant}.{resource}'
                statsd.gauge(key, value, tenant=tenant, resource=resource)

        for project, resources in self.current_resources_by_project.items():
            for resource, value in resources.items():
                key = 'zuul.nodepool.resources.project.' \
                      '{project}.{resource}'
                statsd.gauge(key, value, project=project, resource=resource)
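
    # Sketch of the key expansion (an assumption about Zuul's statsd
    # helper: it formats the keyword arguments into the key template and
    # normalizes the values; the tests above show dots in the project's
    # canonical name 'review.example.com/org/project' becoming
    # underscores):
    #
    #   'zuul.nodepool.resources.tenant.{tenant}.{resource}'.format(
    #       tenant='tenant-one', resource='cores')
    #   # -> 'zuul.nodepool.resources.tenant.tenant-one.cores'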
    def emitStatsResourceCounters(self, tenant, project, resources,
                                  duration):
        if not self.sched.statsd:
            return
        statsd = self.sched.statsd

        for resource, value in resources.items():
            key = 'zuul.nodepool.resources.tenant.{tenant}.{resource}'
            statsd.incr(key, value * duration,
                        tenant=tenant, resource=resource)
        for resource, value in resources.items():
            key = 'zuul.nodepool.resources.project.' \
                  '{project}.{resource}'
            statsd.incr(key, value * duration,
                        project=project, resource=resource)
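
    # Worked example (hypothetical numbers): for resources
    # {'cores': 2, 'ram': 1024} held for duration=300.0 seconds, the
    # increments are value * duration, i.e. 600.0 core-seconds and
    # 307200.0 ram-seconds, counted against both the tenant and the
    # project.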
    def requestNodes(self, build_set, job, relative_priority, event=None):
        log = get_annotated_logger(self.log, event)
        # Create a copy of the nodeset to represent the actual nodes
@@ -157,22 +201,47 @@ class Nodepool(object):
                self.log.debug("Removing autohold for %s", autohold_key)
                del self.sched.autohold_requests[autohold_key]

-    def useNodeSet(self, nodeset):
+    def useNodeSet(self, nodeset, build_set=None):
        self.log.info("Setting nodeset %s in use" % (nodeset,))
        resources = defaultdict(int)
        for node in nodeset.getNodes():
            if node.lock is None:
                raise Exception("Node %s is not locked" % (node,))
            node.state = model.STATE_IN_USE
            self.sched.zk.storeNode(node)
            if node.resources:
                add_resources(resources, node.resources)
        if build_set and resources:
            # We have a build set and thus also a tenant and project, so
            # we can emit project-specific resource usage stats.
            tenant_name = build_set.item.layout.tenant.name
            project_name = build_set.item.change.project.canonical_name

            self.current_resources_by_tenant.setdefault(
                tenant_name, defaultdict(int))
            self.current_resources_by_project.setdefault(
                project_name, defaultdict(int))

            add_resources(self.current_resources_by_tenant[tenant_name],
                          resources)
            add_resources(self.current_resources_by_project[project_name],
                          resources)
            self.emitStatsResources()
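
    # Lifecycle sketch: useNodeSet raises the tenant and project gauges
    # while nodes are in use; returnNodeSet below lowers them again and,
    # once a duration is known, feeds the counters. Gauges therefore
    # track concurrent usage, while counters accumulate resource-seconds.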
    def returnNodeSet(self, nodeset, build=None):
        self.log.info("Returning nodeset %s" % (nodeset,))
        resources = defaultdict(int)
        duration = None
        project = None
        tenant = None
        if build:
            project = build.build_set.item.change.project
            tenant = build.build_set.item.pipeline.tenant.name
        if (build and build.start_time and build.end_time and
                build.build_set and build.build_set.item and
                build.build_set.item.change and
                build.build_set.item.change.project):
            duration = build.end_time - build.start_time
            project = build.build_set.item.change.project
            self.log.info("Nodeset %s with %s nodes was in use "
                          "for %s seconds for build %s for project %s",
                          nodeset, len(nodeset.nodes), duration, build,
@@ -183,6 +252,8 @@ class Nodepool(object):
            else:
                try:
                    if node.state == model.STATE_IN_USE:
                        if node.resources:
                            add_resources(resources, node.resources)
                        node.state = model.STATE_USED
                        self.sched.zk.storeNode(node)
                except Exception:
@@ -190,6 +261,22 @@ class Nodepool(object):
                                    "while unlocking:" % (node,))
        self._unlockNodes(nodeset.getNodes())

        # When returning a nodeset we need to update the gauges if we
        # have a build. Further, we calculate resource * duration and
        # increment the tenant- and project-specific counters. With that
        # we have both the current usage and counters suitable for
        # accounting.
        if tenant and project and resources:
            project_name = project.canonical_name
            subtract_resources(
                self.current_resources_by_tenant[tenant], resources)
            subtract_resources(
                self.current_resources_by_project[project_name], resources)
            self.emitStatsResources()

            if duration:
                self.emitStatsResourceCounters(
                    tenant, project_name, resources, duration)

    def unlockNodeSet(self, nodeset):
        self._unlockNodes(nodeset.getNodes())