Report gross/total tenant resource usage stats
Export a new statsd gauge with the total resources of a tenant. Currently, we only export resources of in-use nodes. With this, we additionally report the cumulative resources of all of a tenant's nodes (i.e. ready, deleting, ...). This also renames the existing in-use resource stat to distinguish those clearly. Change-Id: I76a8c1212c7e9b476782403d52e4e22c030d1371
This commit is contained in:
parent
7f3a0dcaab
commit
eac322d252
|
@ -506,10 +506,14 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
|
|||
reported as ``unknown``. This relates to
|
||||
``zuul.nodepool.current_requests``.
|
||||
|
||||
.. stat:: resources
|
||||
.. stat:: zuul.nodepool.resources
|
||||
|
||||
Holds metrics about resource usage by tenant or project if resources
|
||||
of nodes are reported by nodepool.
|
||||
Holds metrics about resource usage by tenant or project if resources
|
||||
of nodes are reported by nodepool.
|
||||
|
||||
.. stat:: in_use
|
||||
|
||||
Holds metrics about resources currently in use by a build.
|
||||
|
||||
.. stat:: tenant
|
||||
|
||||
|
@ -519,7 +523,7 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
|
|||
:type: counter, gauge
|
||||
|
||||
Counter with the summed usage by tenant as <resource> seconds and
|
||||
gauge with the currently used resources by tenant.
|
||||
gauge with the currently in use resources by tenant.
|
||||
|
||||
.. stat:: project
|
||||
|
||||
|
@ -531,6 +535,21 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
|
|||
Counter with the summed usage by project as <resource> seconds and
|
||||
gauge with the currently used resources by project.
|
||||
|
||||
.. stat:: total
|
||||
|
||||
Holds metrics about resources allocated in total. This includes
|
||||
resources that are currently in use, allocated but not yet in use, and
|
||||
scheduled to be deleted.
|
||||
|
||||
.. stat:: tenant
|
||||
|
||||
Holds resource usage metrics by tenant.
|
||||
|
||||
.. stat:: <tenant>.<resource>
|
||||
:type: gauge
|
||||
|
||||
Gauge with the currently used resources by tenant.
|
||||
|
||||
|
||||
.. stat:: zuul.mergers
|
||||
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
---
|
||||
features:
|
||||
- |
|
||||
Zuul now reports total resource usage statistics.
|
||||
|
||||
The following statistic is emitted:
|
||||
|
||||
`zuul.nodepool.resources.total.tenant.{tenant}.{resource}`
|
||||
|
||||
Gauge with the currently used resources by tenant in total, i.e., all nodes
|
||||
belonging to a tenant, regardless of their state.
|
||||
|
||||
upgrade:
|
||||
- |
|
||||
Zuul now reports total and in-use resource usage statistics separately.
|
||||
|
||||
To distinguish those, the following statistics have been renamed:
|
||||
|
||||
- `zuul.nodepool.resources.tenant.{tenant}.{resource}`
|
||||
`->`
|
||||
`zuul.nodepool.resources.in_use.tenant.{tenant}.{resource}`
|
||||
|
||||
- `zuul.nodepool.resources.project.{project}.{resource}`
|
||||
`->`
|
||||
`zuul.nodepool.resources.in_use.project.{project}.{resource}`
|
|
@ -3636,7 +3636,7 @@ class FakeNodepool(object):
|
|||
nodes.append(data)
|
||||
return nodes
|
||||
|
||||
def makeNode(self, request_id, node_type):
|
||||
def makeNode(self, request_id, node_type, request):
|
||||
now = time.time()
|
||||
path = '/nodepool/nodes/'
|
||||
remote_ip = os.environ.get('ZUUL_REMOTE_IPV4', '127.0.0.1')
|
||||
|
@ -3695,6 +3695,8 @@ class FakeNodepool(object):
|
|||
if 'fedora-pod' in node_type:
|
||||
data['connection_type'] = 'kubectl'
|
||||
data['connection_port']['pod'] = 'fedora-abcdefg'
|
||||
data['tenant_name'] = request['tenant_name']
|
||||
data['requestor'] = request['requestor']
|
||||
|
||||
data = json.dumps(data).encode('utf8')
|
||||
path = self.client.create(path, data,
|
||||
|
@ -3723,7 +3725,7 @@ class FakeNodepool(object):
|
|||
request['state'] = 'fulfilled'
|
||||
nodes = request.get('nodes', [])
|
||||
for node in request['node_types']:
|
||||
nodeid = self.makeNode(oid, node)
|
||||
nodeid = self.makeNode(oid, node, request)
|
||||
nodes.append(nodeid)
|
||||
request['nodes'] = nodes
|
||||
|
||||
|
|
|
@ -2009,8 +2009,9 @@ class TestScheduler(ZuulTestCase):
|
|||
self.assertEqual([], request.nodes)
|
||||
|
||||
# Some convenience variables for checking the stats.
|
||||
tenant_ram_stat = 'zuul.nodepool.resources.tenant.tenant-one.ram'
|
||||
project_ram_stat = ('zuul.nodepool.resources.project.'
|
||||
tenant_ram_stat =\
|
||||
'zuul.nodepool.resources.in_use.tenant.tenant-one.ram'
|
||||
project_ram_stat = ('zuul.nodepool.resources.in_use.project.'
|
||||
'review_example_com/org/project.ram')
|
||||
# Test that we zeroed the gauges
|
||||
self.scheds.first.sched._runStats()
|
||||
|
@ -5672,45 +5673,54 @@ For CI problems and help debugging, contact ci@example.org"""
|
|||
self.assertHistory([
|
||||
dict(name='project-merge', result='SUCCESS', changes='1,1'),
|
||||
])
|
||||
# All 3 nodes are in use
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.cores',
|
||||
'zuul.nodepool.resources.total.tenant.tenant-one.cores',
|
||||
value='6', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.ram',
|
||||
'zuul.nodepool.resources.total.tenant.tenant-one.ram',
|
||||
value='3072', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.instances',
|
||||
'zuul.nodepool.resources.total.tenant.tenant-one.instances',
|
||||
value='3', kind='g')
|
||||
# All 3 nodes are in use
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.in_use.tenant.tenant-one.cores',
|
||||
value='6', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.in_use.tenant.tenant-one.ram',
|
||||
value='3072', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.in_use.tenant.tenant-one.instances',
|
||||
value='3', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'cores', value='6', kind='g')
|
||||
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
|
||||
'project.cores', value='6', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'ram', value='3072', kind='g')
|
||||
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
|
||||
'project.ram', value='3072', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'instances', value='3', kind='g')
|
||||
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
|
||||
'project.instances', value='3', kind='g')
|
||||
|
||||
# Check that resource usage counters are reported
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.cores',
|
||||
'zuul.nodepool.resources.in_use.tenant.tenant-one.cores',
|
||||
kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.ram',
|
||||
'zuul.nodepool.resources.in_use.tenant.tenant-one.ram',
|
||||
kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.tenant.tenant-one.instances',
|
||||
'zuul.nodepool.resources.in_use.tenant.tenant-one.instances',
|
||||
kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'cores', kind='c')
|
||||
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
|
||||
'project.cores', kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'ram', kind='c')
|
||||
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
|
||||
'project.ram', kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.resources.project.review_example_com/org/project.'
|
||||
'instances', kind='c')
|
||||
'zuul.nodepool.resources.in_use.project.review_example_com/org/'
|
||||
'project.instances', kind='c')
|
||||
|
||||
self.executor_server.hold_jobs_in_build = False
|
||||
self.executor_server.release()
|
||||
|
|
|
@ -387,7 +387,9 @@ class TestWeb(BaseTestWeb):
|
|||
'label': 'label1',
|
||||
'name': 'controller',
|
||||
'aliases': [],
|
||||
'requestor': None,
|
||||
'state': 'unknown',
|
||||
'tenant_name': None,
|
||||
'user_data': None}],
|
||||
},
|
||||
'override_checkout': None,
|
||||
|
@ -435,7 +437,9 @@ class TestWeb(BaseTestWeb):
|
|||
'label': 'label2',
|
||||
'name': 'controller',
|
||||
'aliases': [],
|
||||
'requestor': None,
|
||||
'state': 'unknown',
|
||||
'tenant_name': None,
|
||||
'user_data': None}],
|
||||
},
|
||||
'override_checkout': None,
|
||||
|
@ -1057,7 +1061,9 @@ class TestWeb(BaseTestWeb):
|
|||
'id': None,
|
||||
'label': 'label1',
|
||||
'name': 'controller',
|
||||
'requestor': None,
|
||||
'state': 'unknown',
|
||||
'tenant_name': None,
|
||||
'user_data': None}]},
|
||||
'override_branch': None,
|
||||
'override_checkout': None,
|
||||
|
|
|
@ -1208,6 +1208,8 @@ class Node(ConfigObject):
|
|||
self.resources = None
|
||||
self.allocated_to = None
|
||||
self.attributes = {}
|
||||
self.tenant_name = None
|
||||
self.requestor = None
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
|
@ -1240,6 +1242,8 @@ class Node(ConfigObject):
|
|||
d['hold_job'] = self.hold_job
|
||||
d['comment'] = self.comment
|
||||
d['user_data'] = self.user_data
|
||||
d['tenant_name'] = self.tenant_name
|
||||
d['requestor'] = self.requestor
|
||||
for k in self._keys:
|
||||
d[k] = getattr(self, k)
|
||||
if internal_attributes:
|
||||
|
|
|
@ -167,11 +167,11 @@ class Nodepool(object):
|
|||
return
|
||||
|
||||
for resource, value in resources.items():
|
||||
key = 'zuul.nodepool.resources.tenant.{tenant}.{resource}'
|
||||
key = 'zuul.nodepool.resources.in_use.tenant.{tenant}.{resource}'
|
||||
self.statsd.incr(
|
||||
key, value * duration, tenant=tenant, resource=resource)
|
||||
for resource, value in resources.items():
|
||||
key = 'zuul.nodepool.resources.project.' \
|
||||
key = 'zuul.nodepool.resources.in_use.project.' \
|
||||
'{project}.{resource}'
|
||||
self.statsd.incr(
|
||||
key, value * duration, project=project, resource=resource)
|
||||
|
@ -315,7 +315,6 @@ class Nodepool(object):
|
|||
log.info("Setting nodeset %s in use", nodeset)
|
||||
user_data = dict(
|
||||
zuul_system=self.system_id,
|
||||
tenant_name=tenant_name,
|
||||
project_name=project_name,
|
||||
)
|
||||
for node in nodeset.getNodes():
|
||||
|
@ -483,16 +482,19 @@ class Nodepool(object):
|
|||
|
||||
total_requests = 0
|
||||
tenant_requests = defaultdict(int)
|
||||
resources_by_tenant = {}
|
||||
resources_by_project = {}
|
||||
in_use_resources_by_project = {}
|
||||
in_use_resources_by_tenant = {}
|
||||
total_resources_by_tenant = {}
|
||||
empty_resource_dict = dict([(k, 0) for k in self.resource_types])
|
||||
|
||||
# Initialize zero values for gauges
|
||||
for tenant in abide.tenants.values():
|
||||
tenant_requests[tenant.name] = 0
|
||||
resources_by_tenant[tenant.name] = empty_resource_dict.copy()
|
||||
in_use_resources_by_tenant[tenant.name] =\
|
||||
empty_resource_dict.copy()
|
||||
total_resources_by_tenant[tenant.name] = empty_resource_dict.copy()
|
||||
for project in tenant.all_projects:
|
||||
resources_by_project[project.canonical_name] =\
|
||||
in_use_resources_by_project[project.canonical_name] =\
|
||||
empty_resource_dict.copy()
|
||||
|
||||
# Count node requests
|
||||
|
@ -511,34 +513,52 @@ class Nodepool(object):
|
|||
tenant=tenant)
|
||||
|
||||
# Count nodes
|
||||
for node in self.getNodes():
|
||||
for node in self.zk_nodepool.nodeIterator(cached=True):
|
||||
if not node.resources:
|
||||
continue
|
||||
project_name = node.user_data.get('project_name')
|
||||
tenant_name = node.user_data.get('tenant_name')
|
||||
if not (project_name and tenant_name):
|
||||
continue
|
||||
|
||||
tenant_name = node.tenant_name
|
||||
if tenant_name in total_resources_by_tenant and \
|
||||
node.requestor == self.system_id:
|
||||
self.addResources(
|
||||
total_resources_by_tenant[tenant_name],
|
||||
node.resources)
|
||||
|
||||
# below here, we are only interested in nodes which are either
|
||||
# in-use, used, or currently held by this zuul system
|
||||
if node.state not in {model.STATE_IN_USE,
|
||||
model.STATE_USED,
|
||||
model.STATE_HOLD}:
|
||||
continue
|
||||
if tenant_name not in resources_by_tenant:
|
||||
if not node.user_data:
|
||||
continue
|
||||
self.addResources(resources_by_tenant[tenant_name],
|
||||
node.resources)
|
||||
if project_name not in resources_by_project:
|
||||
if node.user_data.get('zuul_system') != self.system_id:
|
||||
continue
|
||||
self.addResources(resources_by_project[project_name],
|
||||
node.resources)
|
||||
|
||||
for tenant, resources in resources_by_tenant.items():
|
||||
if tenant_name in in_use_resources_by_tenant:
|
||||
self.addResources(
|
||||
in_use_resources_by_tenant[tenant_name],
|
||||
node.resources)
|
||||
|
||||
project_name = node.user_data.get('project_name')
|
||||
if project_name in in_use_resources_by_project:
|
||||
self.addResources(
|
||||
in_use_resources_by_project[project_name],
|
||||
node.resources)
|
||||
|
||||
for tenant, resources in total_resources_by_tenant.items():
|
||||
for resource, value in resources.items():
|
||||
key = 'zuul.nodepool.resources.tenant.' \
|
||||
key = 'zuul.nodepool.resources.total.tenant.' \
|
||||
'{tenant}.{resource}'
|
||||
self.statsd.gauge(key, value, tenant=tenant, resource=resource)
|
||||
for project, resources in resources_by_project.items():
|
||||
for tenant, resources in in_use_resources_by_tenant.items():
|
||||
for resource, value in resources.items():
|
||||
key = 'zuul.nodepool.resources.project.' \
|
||||
key = 'zuul.nodepool.resources.in_use.tenant.' \
|
||||
'{tenant}.{resource}'
|
||||
self.statsd.gauge(key, value, tenant=tenant, resource=resource)
|
||||
for project, resources in in_use_resources_by_project.items():
|
||||
for resource, value in resources.items():
|
||||
key = 'zuul.nodepool.resources.in_use.project.' \
|
||||
'{project}.{resource}'
|
||||
self.statsd.gauge(
|
||||
key, value, project=project, resource=resource)
|
||||
|
|
|
@ -1250,7 +1250,7 @@ class ZuulWebAPI(object):
|
|||
isinstance(node.user_data, dict) and
|
||||
node.user_data.get('zuul_system') ==
|
||||
self.system.system_id and
|
||||
node.user_data.get('tenant_name') == tenant):
|
||||
node.tenant_name == tenant):
|
||||
continue
|
||||
node_data = {}
|
||||
for key in ("id", "type", "connection_type", "external_id",
|
||||
|
|
|
@ -193,11 +193,11 @@ class ZooKeeperNodepool(ZooKeeperBase):
|
|||
node.stat = stat
|
||||
return node
|
||||
|
||||
def nodeIterator(self):
|
||||
def nodeIterator(self, cached=False):
|
||||
"""
|
||||
Utility generator method for iterating through all nodes.
|
||||
"""
|
||||
for node_id in self.getNodes():
|
||||
for node_id in self.getNodes(cached):
|
||||
node = self.getNode(node_id)
|
||||
if node:
|
||||
yield node
|
||||
|
|
Loading…
Reference in New Issue