zuul/zuul/nodepool.py

Latest change: e41af7c312 by Simon Westphahl (2024-03-07 08:02:30 +01:00)

    Add job name back to node request data

    With the circular dependency refactoring we also removed the job name
    from the requestor data in the node request. However, this could
    previously be used as part of the dynamic-tags in Nodepool, which might
    be useful for billing and cost calculations.

    Add back the job name so those use cases start working again.

    Change-Id: Ie3be39819bf84d05a7427cd0e859f485de90835d


# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import logging
import threading
import time
from collections import defaultdict

from zuul import model
from zuul.lib import tracing
from zuul.lib.logutil import get_annotated_logger
from zuul.zk.event_queues import (
    PipelineResultEventQueue,
    NodepoolEventElection
)
from zuul.zk.exceptions import LockException
from zuul.zk.nodepool import NodeRequestEvent, ZooKeeperNodepool


class Nodepool(object):
    log = logging.getLogger('zuul.nodepool')

    # The kind of resources we report stats on. We need a complete
    # list in order to report 0 level gauges.
    resource_types = ('ram', 'cores', 'instances')

    def __init__(self, zk_client, system_id, statsd, scheduler=False):
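        """Interface for requesting and managing nodes via Nodepool.

        :param zk_client: The ZooKeeper client connection to use.
        :param str system_id: Identifier of this Zuul system; used to
            recognize our own node requests and nodes.
        :param statsd: A statsd client (or None) used to emit metrics.
        :param bool scheduler: Whether this instance runs in the
            scheduler, which enables the node request cache, the event
            callback, and the nodepool event election.
        """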
        self._stopped = False
        self.system_id = system_id
        self.statsd = statsd
        self.election_won = False
        if scheduler:
            # Only enable the node request cache/callback for the scheduler.
            self.stop_watcher_event = threading.Event()
            self.zk_nodepool = ZooKeeperNodepool(
                zk_client,
                enable_node_request_cache=True,
                node_request_event_callback=self._handleNodeRequestEvent,
                connection_suspended_callback=self.stop_watcher_event,
                enable_node_cache=True)
            self.election = NodepoolEventElection(zk_client)
            self.event_thread = threading.Thread(target=self.runEventElection)
            self.event_thread.daemon = True
            self.event_thread.start()
        else:
            self.stop_watcher_event = None
            self.zk_nodepool = ZooKeeperNodepool(zk_client)
            self.election = None
            self.event_thread = None

        self.pipeline_result_events = PipelineResultEventQueue.createRegistry(
            zk_client
        )

    def addResources(self, target, source):
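        """Add the resource values from ``source`` to ``target``.

        Only keys listed in ``resource_types`` are accumulated; other
        entries in ``source`` are ignored.
        """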
        for key, value in source.items():
            if key in self.resource_types:
                target[key] += value

    def runEventElection(self):
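        """Run the nodepool event election in a loop.

        The winner of the election is responsible for turning completed
        node requests into NodesProvisioned results (see _electionWon).
        """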
        while not self._stopped:
            try:
                self.log.debug("Running nodepool watcher election")
                self.election.run(self._electionWon)
            except Exception:
                self.log.exception("Error in nodepool watcher:")

    def stop(self):
        self.log.debug("Stopping")
        self._stopped = True
        if self.election:
            self.election.cancel()
            self.stop_watcher_event.set()
            self.event_thread.join()
            # Delete the election to avoid a FD leak in tests.
            del self.election

    def _sendNodesProvisionedEvent(self, request):
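        """End the request's tracing span and enqueue a NodesProvisionedEvent
        in the result event queue of the request's tenant and pipeline.
        """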
        tracing.endSavedSpan(request.span_info, attributes={
            "request_id": request.id,
            "state": request.state,
        })
        tenant_name = request.tenant_name
        pipeline_name = request.pipeline_name

        event = model.NodesProvisionedEvent(request.id, request.build_set_uuid)
        self.pipeline_result_events[tenant_name][pipeline_name].put(event)

    def _electionWon(self):
        self.log.info("Watching nodepool requests")
        # Iterate over every completed request in case we are starting
        # up or missed something in the transition.
        self.election_won = True
        try:
            for rid in self.zk_nodepool.getNodeRequests():
                request = self.zk_nodepool.getNodeRequest(rid, cached=True)
                if request.requestor != self.system_id:
                    continue
                if (request.state in {model.STATE_FULFILLED,
                                      model.STATE_FAILED}):
                    self._sendNodesProvisionedEvent(request)
            # Now resume normal event processing.
            self.stop_watcher_event.wait()
        finally:
            self.stop_watcher_event.clear()
            self.election_won = False

    def _handleNodeRequestEvent(self, request, event):
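        """Callback invoked by the ZooKeeper node request cache.

        Requests from other Zuul systems are ignored.  For completed
        requests, the current election winner emits stats and sends the
        NodesProvisioned event; on errors the election is re-run.
        """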
        log = get_annotated_logger(self.log, event=request.event_id)
        if request.requestor != self.system_id:
            return

        log.debug("Node request %s %s", request, request.state)

        if event == NodeRequestEvent.COMPLETED:
            try:
                if self.election_won:
                    if self.election.is_still_valid():
                        self.emitStats(request)
                        self._sendNodesProvisionedEvent(request)
                    else:
                        self.stop_watcher_event.set()
            except Exception:
                # If there are any errors moving the event, re-run the
                # election.
                if self.stop_watcher_event is not None:
                    self.stop_watcher_event.set()
                raise

    def emitStats(self, request):
        # Implements the following:
        #   counter zuul.nodepool.requests.<state>.total
        #   counter zuul.nodepool.requests.<state>.label.<label>
        #   counter zuul.nodepool.requests.<state>.size.<size>
        #   timer   zuul.nodepool.requests.(fulfilled|failed)
        #   timer   zuul.nodepool.requests.(fulfilled|failed).<label>
        #   timer   zuul.nodepool.requests.(fulfilled|failed).<size>
        if not self.statsd:
            return
        pipe = self.statsd.pipeline()
        state = request.state
        dt = None

        if request.canceled:
            state = 'canceled'
        elif request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
            dt = (request.state_time - request.requested_time) * 1000

        key = 'zuul.nodepool.requests.%s' % state
        pipe.incr(key + ".total")
        if dt:
            pipe.timing(key, dt)

        for label in request.labels:
            pipe.incr(key + '.label.%s' % label)
            if dt:
                pipe.timing(key + '.label.%s' % label, dt)

        pipe.incr(key + '.size.%s' % len(request.labels))
        if dt:
            pipe.timing(key + '.size.%s' % len(request.labels), dt)
        pipe.send()

    def emitStatsResourceCounters(self, tenant, project, resources, duration):
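        """Emit in-use resource counters for a tenant and project.

        Each resource value is multiplied by the duration (in seconds)
        the nodes were in use, yielding e.g. RAM-seconds or core-seconds.
        """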
        if not self.statsd:
            return

        for resource, value in resources.items():
            key = 'zuul.nodepool.resources.in_use.tenant.{tenant}.{resource}'
            self.statsd.incr(
                key, value * duration, tenant=tenant, resource=resource)
        for resource, value in resources.items():
            key = 'zuul.nodepool.resources.in_use.project.' \
                  '{project}.{resource}'
            self.statsd.incr(
                key, value * duration, project=project, resource=resource)

    def requestNodes(self, build_set_uuid, job, tenant_name, pipeline_name,
                     provider, priority, relative_priority, event=None):
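        """Create a NodeRequest for the given job and submit it to Nodepool.

        The request carries this system's id, the build set and job
        identifiers, the labels of the job's nodeset, and the triggering
        event id.  Requests for an empty nodeset are fulfilled directly
        without being submitted to ZooKeeper.

        :returns: The new NodeRequest.
        """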
        log = get_annotated_logger(self.log, event)
        labels = [n.label for n in job.nodeset.getNodes()]
        if event:
            event_id = event.zuul_event_id
        else:
            event_id = None
        req = model.NodeRequest(self.system_id, build_set_uuid, tenant_name,
                                pipeline_name, job.uuid, job.name, labels,
                                provider, relative_priority, event_id)

        if job.nodeset.nodes:
            self.zk_nodepool.submitNodeRequest(req, priority)
            # Logged after submission so that we have the request id
            log.info("Submitted node request %s", req)
            self.emitStats(req)
        else:
            # Directly fulfill the node request without submitting it to ZK,
            # so nodepool doesn't have to deal with it.
            log.info("Fulfilling empty node request %s", req)
            req.state = model.STATE_FULFILLED
        return req

    def cancelRequest(self, request):
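        """Mark the node request as canceled and delete it from ZooKeeper.

        Stats are only emitted if the request was actually deleted.
        """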
        log = get_annotated_logger(self.log, request.event_id)
        log.info("Canceling node request %s", request)
        try:
            request.canceled = True
            if self.zk_nodepool.deleteNodeRequest(request.id):
                self.emitStats(request)
        except Exception:
            log.exception("Error deleting node request:")
    def reviseRequest(self, request, relative_priority=None):
        '''Attempt to update the node request, if it is not currently being
        processed.

        :param NodeRequest request: The request to update.
        :param int relative_priority: If supplied, the new relative
            priority to set on the request.
        '''
        log = get_annotated_logger(self.log, request.event_id)

        if relative_priority is None:
            return

        try:
            self.zk_nodepool.lockNodeRequest(request, blocking=False)
        except LockException:
            # It may be locked by nodepool, which is fine.
            log.debug("Unable to revise locked node request %s", request)
            return False

        try:
            old_priority = request.relative_priority
            request.relative_priority = relative_priority
            self.zk_nodepool.storeNodeRequest(request)
            log.debug("Revised relative priority of "
                      "node request %s from %s to %s",
                      request, old_priority, relative_priority)
        except Exception:
            log.exception("Unable to update node request %s", request)
        finally:
            try:
                self.zk_nodepool.unlockNodeRequest(request)
            except Exception:
                log.exception("Unable to unlock node request %s", request)
    def holdNodeSet(self, nodeset, request, build, duration,
                    zuul_event_id=None):
        '''
        Perform a hold on the given set of nodes.

        :param NodeSet nodeset: The object containing the set of nodes to hold.
        :param HoldRequest request: Hold request associated with the NodeSet
        :param build: The build which used these nodes.
        :param duration: The time in seconds the nodes were in use.
        :param zuul_event_id: The event id to use for annotated logging.
        '''
        log = get_annotated_logger(self.log, zuul_event_id)
        log.info("Holding nodeset %s", nodeset)
        resources = defaultdict(int)
        nodes = nodeset.getNodes()

        log.info(
            "Nodeset %s with %s nodes was in use for %s seconds for build %s "
            "for project %s",
            nodeset, len(nodeset.nodes), duration, build, request.project)

        for node in nodes:
            if node.lock is None:
                raise Exception(f"Node {node} is not locked")
            if node.resources:
                self.addResources(resources, node.resources)
            node.state = model.STATE_HOLD
            node.hold_job = " ".join([request.tenant,
                                      request.project,
                                      request.job,
                                      request.ref_filter])
            node.comment = request.reason
            if request.node_expiration:
                node.hold_expiration = request.node_expiration
            self.zk_nodepool.storeNode(node)

        request.nodes.append(dict(
            build=build.uuid,
            nodes=[node.id for node in nodes],
        ))
        request.current_count += 1

        # Request has been used at least the maximum number of times so set
        # the expiration time so that it can be auto-deleted.
        if request.current_count >= request.max_count and not request.expired:
            request.expired = time.time()

        # Give ourselves a few seconds to try to obtain the lock rather than
        # immediately give up.
        self.zk_nodepool.lockHoldRequest(request, timeout=5)

        try:
            self.zk_nodepool.storeHoldRequest(request)
        except Exception:
            # If we fail to update the request count, we won't consider it
            # a real autohold error by passing the exception up. It will
            # just get used more than the original count specified.
            # It's possible to leak some held nodes, though, which would
            # require manual node deletes.
            log.exception("Unable to update hold request %s:", request)
        finally:
            # Although any exceptions thrown here are handled higher up in
            # _doBuildCompletedEvent, we always want to try to unlock it.
            self.zk_nodepool.unlockHoldRequest(request)

        if resources and duration:
            self.emitStatsResourceCounters(
                request.tenant, request.project, resources, duration)

    def useNodeSet(self, nodeset, tenant_name, project_name,
                   zuul_event_id=None):
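        """Mark all nodes in the nodeset as in-use.

        Each node's user_data is updated with this system's id and the
        project name so the nodes can later be attributed to this Zuul
        system in stats.
        """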
        log = get_annotated_logger(self.log, zuul_event_id)
        log.info("Setting nodeset %s in use", nodeset)
        user_data = dict(
            zuul_system=self.system_id,
            project_name=project_name,
        )
        for node in nodeset.getNodes():
            if node.lock is None:
                raise Exception("Node %s is not locked" % node)
            node.state = model.STATE_IN_USE
            node.user_data = user_data
            self.zk_nodepool.storeNode(node)

    def returnNodeSet(self, nodeset, build, tenant_name, project_name,
                      duration, zuul_event_id=None):
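        """Return the nodes of a completed build back to Nodepool.

        Nodes that are still in use are marked as used and stored, all
        node locks are released, and in-use resource stats are emitted
        for the tenant and project.
        """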
        log = get_annotated_logger(self.log, zuul_event_id)
        log.info("Returning nodeset %s", nodeset)
        resources = defaultdict(int)

        for node in nodeset.getNodes():
            if node.lock is None:
                log.error("Node %s is not locked", node)
            else:
                try:
                    if node.state == model.STATE_IN_USE:
                        if node.resources:
                            self.addResources(resources, node.resources)
                        node.state = model.STATE_USED
                        self.zk_nodepool.storeNode(node)
                except Exception:
                    log.exception("Exception storing node %s "
                                  "while unlocking:", node)
        self.unlockNodeSet(nodeset)

        log.info("Nodeset %s with %s nodes was in use "
                 "for %s seconds for build %s for project %s",
                 nodeset, len(nodeset.nodes), duration, build, project_name)

        if resources and duration:
            self.emitStatsResourceCounters(
                tenant_name, project_name, resources, duration)

    def unlockNodeSet(self, nodeset):
        self._unlockNodes(nodeset.getNodes())

    def _unlockNodes(self, nodes):
        for node in nodes:
            try:
                self.zk_nodepool.unlockNode(node)
            except Exception:
                self.log.exception("Error unlocking node:")

    def lockNodes(self, request, nodeset):
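        """Lock the nodes allocated to the given request.

        The allocation is verified before and after taking each lock to
        make sure Nodepool did not reallocate a node to another request.
        If any lock fails, all nodes locked so far are unlocked and the
        error is re-raised.
        """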
        log = get_annotated_logger(self.log, event=request.event_id)
        # Try to lock all of the supplied nodes. If any lock fails,
        # try to unlock any which have already been locked before
        # re-raising the error.
        locked_nodes = []
        try:
            for node_id, node in zip(request.nodes, nodeset.getNodes()):
                self.zk_nodepool.updateNode(node, node_id)
                if node.allocated_to != request.id:
                    raise Exception("Node %s allocated to %s, not %s" %
                                    (node.id, node.allocated_to, request.id))
                log.debug("Locking node %s", node)
                self.zk_nodepool.lockNode(node, timeout=30)
                # Check the allocated_to again to ensure that nodepool didn't
                # re-allocate the nodes to a different node request while we
                # were locking them.
                if node.allocated_to != request.id:
                    raise Exception(
                        "Node %s was reallocated during locking %s, not %s" %
                        (node.id, node.allocated_to, request.id))
                locked_nodes.append(node)
        except Exception:
            log.exception("Error locking nodes:")
            self._unlockNodes(locked_nodes)
            raise

    def getNodeSet(self, request, job_nodeset):
        """
        Get a NodeSet object with info about real nodes.

        :param NodeRequest request: The fulfilled NodeRequest
        :param NodeSet job_nodeset: The NodeSet object attached to the job

        :returns: A new NodeSet object which contains information from
            nodepool about the actual allocated nodes.
        """
        # A copy of the nodeset with information about the real nodes
        nodeset = job_nodeset.copy()

        # If we didn't request nodes and the request is fulfilled then just
        # return. We don't have to do anything in this case.
        if not request.labels and request.fulfilled:
            return nodeset

        # Load the node info from ZK.
        for node_id, node in zip(request.nodes, nodeset.getNodes()):
            self.zk_nodepool.updateNode(node, node_id)

        return nodeset

    def acceptNodes(self, request, nodeset):
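        """Accept the nodes supplied by a completed node request.

        If the request was fulfilled, lock its nodes; regardless of the
        outcome, delete the request from ZooKeeper.  Raises an exception
        if the request failed or the nodes could not be locked.
        """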
        # Accept the nodes supplied by request, mutate nodeset with
        # the real node information.
        locked = False
        if request.fulfilled:
            # If the request succeeded, try to lock the nodes.
            try:
                nodes = self.lockNodes(request, nodeset)
                locked = True
            except Exception:
                log = get_annotated_logger(self.log, request.event_id)
                log.exception("Error locking nodes:")
                request.failed = True

        # Regardless of whether locking (or even the request)
        # succeeded, delete the request.
        if not self.deleteNodeRequest(request.id, locked, request.event_id):
            request.failed = True
            self.unlockNodeSet(request.nodeset)

        if request.failed:
            raise Exception("Accepting nodes failed")

        return nodes

    def deleteNodeRequest(self, request_id, locked=False, event_id=None):
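        """Delete a node request from ZooKeeper.

        :returns: True if the request was deleted, False if an error
            occurred.
        """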
        log = get_annotated_logger(self.log, event_id)
        log.debug("Deleting node request %s", request_id)
        try:
            self.zk_nodepool.deleteNodeRequest(request_id)
        except Exception:
            log.exception("Error deleting node request:")
            return False
        return True

    def getNodeRequests(self):
        """Get all node requests submitted by Zuul

        Note this relies entirely on the internal cache.

        :returns: An iterator of NodeRequest objects created by this Zuul
            system.
        """
        for req_id in self.zk_nodepool.getNodeRequests(cached=True):
            req = self.zk_nodepool.getNodeRequest(req_id, cached=True)
            if req.requestor == self.system_id:
                yield req

    def getNodes(self):
        """Get all nodes in use or held by Zuul

        Note this relies entirely on the internal cache.

        :returns: An iterator of Node objects in use (or held) by this Zuul
            system.
        """
        for node_id in self.zk_nodepool.getNodes(cached=True):
            node = self.zk_nodepool.getNode(node_id)
            if (node.user_data and
                    isinstance(node.user_data, dict) and
                    node.user_data.get('zuul_system') == self.system_id):
                yield node

    def emitStatsTotals(self, abide):
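        """Emit gauges for current node requests and node resources.

        This reports the number of outstanding node requests overall and
        per tenant, as well as total resources per tenant and in-use
        resources per tenant and per project, initializing zero values
        for every tenant and project in the abide.
        """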
        if not self.statsd:
            return
        total_requests = 0
        tenant_requests = defaultdict(int)
        in_use_resources_by_project = {}
        in_use_resources_by_tenant = {}
        total_resources_by_tenant = {}
        empty_resource_dict = dict([(k, 0) for k in self.resource_types])

        # Initialize zero values for gauges
        for tenant in abide.tenants.values():
            tenant_requests[tenant.name] = 0
            in_use_resources_by_tenant[tenant.name] =\
                empty_resource_dict.copy()
            total_resources_by_tenant[tenant.name] = empty_resource_dict.copy()
            for project in tenant.all_projects:
                in_use_resources_by_project[project.canonical_name] =\
                    empty_resource_dict.copy()

        # Count node requests
        for req in self.getNodeRequests():
            total_requests += 1
            if not req.tenant_name:
                continue
            tenant_requests[req.tenant_name] += 1

        self.statsd.gauge('zuul.nodepool.current_requests',
                          total_requests)
        for tenant, request_count in tenant_requests.items():
            self.statsd.gauge(
                "zuul.nodepool.tenant.{tenant}.current_requests",
                request_count,
                tenant=tenant)

        # Count nodes
        for node in self.zk_nodepool.nodeIterator(cached=True):
            if not node.resources:
                continue

            tenant_name = node.tenant_name

            if tenant_name in total_resources_by_tenant and \
                    node.requestor == self.system_id:
                self.addResources(
                    total_resources_by_tenant[tenant_name],
                    node.resources)

            # below here, we are only interested in nodes which are either
            # in-use, used, or currently held by this zuul system
            if node.state not in {model.STATE_IN_USE,
                                  model.STATE_USED,
                                  model.STATE_HOLD}:
                continue

            if not node.user_data:
                continue
            if not isinstance(node.user_data, dict):
                continue
            if node.user_data.get('zuul_system') != self.system_id:
                continue

            if tenant_name in in_use_resources_by_tenant:
                self.addResources(
                    in_use_resources_by_tenant[tenant_name],
                    node.resources)

            project_name = node.user_data.get('project_name')
            if project_name in in_use_resources_by_project:
                self.addResources(
                    in_use_resources_by_project[project_name],
                    node.resources)

        for tenant, resources in total_resources_by_tenant.items():
            for resource, value in resources.items():
                key = 'zuul.nodepool.resources.total.tenant.' \
                      '{tenant}.{resource}'
                self.statsd.gauge(key, value, tenant=tenant, resource=resource)

        for tenant, resources in in_use_resources_by_tenant.items():
            for resource, value in resources.items():
                key = 'zuul.nodepool.resources.in_use.tenant.' \
                      '{tenant}.{resource}'
                self.statsd.gauge(key, value, tenant=tenant, resource=resource)

        for project, resources in in_use_resources_by_project.items():
            for resource, value in resources.items():
                key = 'zuul.nodepool.resources.in_use.project.' \
                      '{project}.{resource}'
                self.statsd.gauge(
                    key, value, project=project, resource=resource)