Report tenant and project specific resource usage stats
We currently lack means to support resource accounting of tenants or projects. Together with an addition to nodepool that adds resource metadata to nodes we can emit statsd statistics per tenant and per project. The following statistics are emitted: * zuul.nodepool.resources.tenant.{tenant}.{resource}.current Gauge with the currently used resources by tenant * zuul.nodepool.resources.project.{project}.{resource}.current Gauge with the currently used resources by project * zuul.nodepool.resources.tenant.{tenant}.{resource}.counter Counter with the summed usage by tenant. e.g. cpu seconds * zuul.nodepool.resources.project.{project}.{resource}.counter Counter with the summed usage by project. e.g. cpu seconds Depends-On: https://review.openstack.org/616262 Change-Id: I68ea68128287bf52d107959e1c343dfce98f1fc8
This commit is contained in:
parent
af910d9d8e
commit
e90fe41bfe
|
@ -288,6 +288,32 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
|
|||
Persistently high values indicate more testing node resources
|
||||
would be helpful.
|
||||
|
||||
.. stat:: resources
|
||||
|
||||
Holds metrics about resource usage by tenant or project if resources
|
||||
of nodes are reported by nodepool.
|
||||
|
||||
.. stat:: tenant
|
||||
|
||||
Holds resource usage metrics by tenant.
|
||||
|
||||
.. stat:: <tenant>.<resource>
|
||||
:type: counter, gauge
|
||||
|
||||
Counter with the summed usage by tenant as <resource> seconds and
|
||||
gauge with the currently used resources by tenant.
|
||||
|
||||
.. stat:: project
|
||||
|
||||
Holds resource usage metrics by project.
|
||||
|
||||
.. stat:: <project>.<resource>
|
||||
:type: counter, gauge
|
||||
|
||||
Counter with the summed usage by project as <resource> seconds and
gauge with the currently used resources by project.
|
||||
|
||||
|
||||
.. stat:: zuul.mergers
|
||||
|
||||
Holds metrics related to Zuul mergers.
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
---
|
||||
features:
|
||||
- |
|
||||
Zuul now reports resource usage statistics if they are provided by nodepool.
|
||||
|
||||
The following statistics are emitted:
|
||||
|
||||
- `zuul.nodepool.resources.tenant.{tenant}.{resource}`:
|
||||
Gauge with the currently used resources by tenant and counter with the
|
||||
summed usage by tenant, e.g. CPU seconds
|
||||
|
||||
- `zuul.nodepool.resources.project.{project}.{resource}`:
|
||||
Gauge with the currently used resources by project and counter with the
|
||||
summed usage by project, e.g. CPU seconds
|
|
@ -1846,6 +1846,7 @@ class FakeNodepool(object):
|
|||
self.fail_requests = set()
|
||||
self.remote_ansible = False
|
||||
self.attributes = None
|
||||
self.resources = None
|
||||
|
||||
def stop(self):
|
||||
self._running = False
|
||||
|
@ -1951,6 +1952,8 @@ class FakeNodepool(object):
|
|||
host_keys=host_keys,
|
||||
executor='fake-nodepool',
|
||||
hold_expiration=None)
|
||||
if self.resources:
|
||||
data['resources'] = self.resources
|
||||
if self.remote_ansible:
|
||||
data['connection_type'] = 'ssh'
|
||||
if 'fakeuser' in node_type:
|
||||
|
|
|
@ -5317,6 +5317,72 @@ For CI problems and help debugging, contact ci@example.org"""
|
|||
self.assertIn('project-test1 : SKIPPED', A.messages[1])
|
||||
self.assertIn('project-test2 : SKIPPED', A.messages[1])
|
||||
|
||||
def test_nodepool_resources(self):
|
||||
"Test that resources are reported"
|
||||
|
||||
self.executor_server.hold_jobs_in_build = True
|
||||
self.fake_nodepool.resources = {
|
||||
'cores': 2,
|
||||
'ram': 1024,
|
||||
'instances': 1,
|
||||
}
|
||||
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
|
||||
A.addApproval('Code-Review', 2)
|
||||
self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.executor_server.release('project-merge')
|
||||
self.waitUntilSettled()
|
||||
|
||||
# Check that resource usage gauges are reported
|
||||
self.assertHistory([
|
||||
dict(name='project-merge', result='SUCCESS', changes='1,1'),
|
||||
])
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.cores',
|
||||
value='2', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.ram',
|
||||
value='1024', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.instances',
|
||||
value='1', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'cores', value='2', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'ram', value='1024', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'instances', value='1', kind='g')
|
||||
|
||||
# Check that resource usage counters are reported
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.cores',
|
||||
kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.ram',
|
||||
kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.instances',
|
||||
kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'cores', kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'ram', kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'instances', kind='c')
|
||||
|
||||
self.executor_server.hold_jobs_in_build = False
|
||||
self.executor_server.release()
|
||||
self.waitUntilSettled()
|
||||
self.assertEqual(A.data['status'], 'MERGED')
|
||||
self.assertEqual(A.reported, 2)
|
||||
|
||||
def test_nodepool_pipeline_priority(self):
|
||||
"Test that nodes are requested at the correct pipeline priority"
|
||||
|
||||
|
@ -6520,7 +6586,7 @@ class TestSemaphore(ZuulTestCase):
|
|||
# Simulate a single zk error in useNodeSet
|
||||
orig_useNodeSet = self.nodepool.useNodeSet
|
||||
|
||||
def broken_use_nodeset(nodeset):
|
||||
def broken_use_nodeset(nodeset, build_set=None):
|
||||
# restore original useNodeSet
|
||||
self.nodepool.useNodeSet = orig_useNodeSet
|
||||
raise NoNodeError()
|
||||
|
|
|
@ -403,7 +403,8 @@ class PipelineManager(object):
|
|||
self.log.debug("Found job %s for change %s" % (job, item.change))
|
||||
try:
|
||||
nodeset = item.current_build_set.getJobNodeSet(job.name)
|
||||
self.sched.nodepool.useNodeSet(nodeset)
|
||||
self.sched.nodepool.useNodeSet(
|
||||
nodeset, build_set=item.current_build_set)
|
||||
self.sched.executor.execute(
|
||||
job, item, self.pipeline,
|
||||
build_set.dependent_changes,
|
||||
|
|
|
@ -547,6 +547,7 @@ class Node(ConfigObject):
|
|||
self.region = None
|
||||
self.username = None
|
||||
self.hold_expiration = None
|
||||
self.resources = None
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
|
|
|
@ -12,17 +12,30 @@
|
|||
|
||||
import logging
|
||||
|
||||
from collections import defaultdict
|
||||
from zuul import model
|
||||
from zuul.lib.logutil import get_annotated_logger
|
||||
from zuul.zk import LockException
|
||||
|
||||
|
||||
def add_resources(target, source):
    """Add each resource amount in *source* onto *target*, in place.

    Both arguments are mappings of resource name -> numeric amount;
    *target* must already contain (or default-construct) every key
    present in *source*.
    """
    for resource in source:
        target[resource] = target[resource] + source[resource]
|
||||
|
||||
|
||||
def subtract_resources(target, source):
    """Subtract each resource amount in *source* from *target*, in place.

    Mirror image of add_resources(); both arguments are mappings of
    resource name -> numeric amount.
    """
    for resource in source:
        target[resource] = target[resource] - source[resource]
|
||||
|
||||
|
||||
class Nodepool(object):
|
||||
log = logging.getLogger('zuul.nodepool')
|
||||
|
||||
def __init__(self, scheduler):
|
||||
self.requests = {}
|
||||
self.sched = scheduler
|
||||
self.current_resources_by_tenant = {}
|
||||
self.current_resources_by_project = {}
|
||||
|
||||
def emitStats(self, request):
|
||||
# Implements the following :
|
||||
|
@ -60,6 +73,37 @@ class Nodepool(object):
|
|||
pipe.gauge('zuul.nodepool.current_requests', len(self.requests))
|
||||
pipe.send()
|
||||
|
||||
def emitStatsResources(self):
|
||||
if not self.sched.statsd:
|
||||
return
|
||||
statsd = self.sched.statsd
|
||||
|
||||
for tenant, resources in self.current_resources_by_tenant.items():
|
||||
for resource, value in resources.items():
|
||||
key = 'zuul.nodepool.resources.tenant.' \
|
||||
'{tenant}.{resource}'
|
||||
statsd.gauge(key, value, tenant=tenant, resource=resource)
|
||||
for project, resources in self.current_resources_by_project.items():
|
||||
for resource, value in resources.items():
|
||||
key = 'zuul.nodepool.resources.project.' \
|
||||
'{project}.{resource}'
|
||||
statsd.gauge(key, value, project=project, resource=resource)
|
||||
|
||||
def emitStatsResourceCounters(self, tenant, project, resources, duration):
|
||||
if not self.sched.statsd:
|
||||
return
|
||||
statsd = self.sched.statsd
|
||||
|
||||
for resource, value in resources.items():
|
||||
key = 'zuul.nodepool.resources.tenant.{tenant}.{resource}'
|
||||
statsd.incr(key, value * duration,
|
||||
tenant=tenant, resource=resource)
|
||||
for resource, value in resources.items():
|
||||
key = 'zuul.nodepool.resources.project.' \
|
||||
'{project}.{resource}'
|
||||
statsd.incr(key, value * duration,
|
||||
project=project, resource=resource)
|
||||
|
||||
def requestNodes(self, build_set, job, relative_priority, event=None):
|
||||
log = get_annotated_logger(self.log, event)
|
||||
# Create a copy of the nodeset to represent the actual nodes
|
||||
|
@ -157,22 +201,47 @@ class Nodepool(object):
|
|||
self.log.debug("Removing autohold for %s", autohold_key)
|
||||
del self.sched.autohold_requests[autohold_key]
|
||||
|
||||
def useNodeSet(self, nodeset):
|
||||
def useNodeSet(self, nodeset, build_set=None):
|
||||
self.log.info("Setting nodeset %s in use" % (nodeset,))
|
||||
resources = defaultdict(int)
|
||||
for node in nodeset.getNodes():
|
||||
if node.lock is None:
|
||||
raise Exception("Node %s is not locked" % (node,))
|
||||
node.state = model.STATE_IN_USE
|
||||
self.sched.zk.storeNode(node)
|
||||
if node.resources:
|
||||
add_resources(resources, node.resources)
|
||||
if build_set and resources:
|
||||
# we have a buildset and thus also tenant and project so we
|
||||
# can emit project specific resource usage stats
|
||||
tenant_name = build_set.item.layout.tenant.name
|
||||
project_name = build_set.item.change.project.canonical_name
|
||||
|
||||
self.current_resources_by_tenant.setdefault(
|
||||
tenant_name, defaultdict(int))
|
||||
self.current_resources_by_project.setdefault(
|
||||
project_name, defaultdict(int))
|
||||
|
||||
add_resources(self.current_resources_by_tenant[tenant_name],
|
||||
resources)
|
||||
add_resources(self.current_resources_by_project[project_name],
|
||||
resources)
|
||||
self.emitStatsResources()
|
||||
|
||||
def returnNodeSet(self, nodeset, build=None):
|
||||
self.log.info("Returning nodeset %s" % (nodeset,))
|
||||
resources = defaultdict(int)
|
||||
duration = None
|
||||
project = None
|
||||
tenant = None
|
||||
if build:
|
||||
project = build.build_set.item.change.project
|
||||
tenant = build.build_set.item.pipeline.tenant.name
|
||||
if (build and build.start_time and build.end_time and
|
||||
build.build_set and build.build_set.item and
|
||||
build.build_set.item.change and
|
||||
build.build_set.item.change.project):
|
||||
duration = build.end_time - build.start_time
|
||||
project = build.build_set.item.change.project
|
||||
self.log.info("Nodeset %s with %s nodes was in use "
|
||||
"for %s seconds for build %s for project %s",
|
||||
nodeset, len(nodeset.nodes), duration, build,
|
||||
|
@ -183,6 +252,8 @@ class Nodepool(object):
|
|||
else:
|
||||
try:
|
||||
if node.state == model.STATE_IN_USE:
|
||||
if node.resources:
|
||||
add_resources(resources, node.resources)
|
||||
node.state = model.STATE_USED
|
||||
self.sched.zk.storeNode(node)
|
||||
except Exception:
|
||||
|
@ -190,6 +261,22 @@ class Nodepool(object):
|
|||
"while unlocking:" % (node,))
|
||||
self._unlockNodes(nodeset.getNodes())
|
||||
|
||||
# When returning a nodeset we need to update the gauges if we have a
|
||||
# build. Further we calculate resource*duration and increment their
|
||||
# tenant or project specific counters. With that we have both the
|
||||
# current value and also counters to be able to perform accounting.
|
||||
if tenant and project and resources:
|
||||
project_name = project.canonical_name
|
||||
subtract_resources(
|
||||
self.current_resources_by_tenant[tenant], resources)
|
||||
subtract_resources(
|
||||
self.current_resources_by_project[project_name], resources)
|
||||
self.emitStatsResources()
|
||||
|
||||
if duration:
|
||||
self.emitStatsResourceCounters(
|
||||
tenant, project_name, resources, duration)
|
||||
|
||||
def unlockNodeSet(self, nodeset):
|
||||
self._unlockNodes(nodeset.getNodes())
|
||||
|
||||
|
|
Loading…
Reference in New Issue