Add nodeset alternatives

This feature instructs Zuul to attempt a second or more node request
with a different node configuration (ie, possibly different labels)
if the first one fails.

It is intended to address the case where a cloud provider is unable
to supply specialized high-performance nodes, and the user would like
the job to proceed anyway on lower-performance nodes.

Change-Id: Idede4244eaa3b21a34c20099214fda6ecdc992df
This commit is contained in:
James E. Blair
2022-09-03 14:03:36 -07:00
parent 64fa9db579
commit 1958bbad03
14 changed files with 587 additions and 56 deletions

View File

@@ -880,23 +880,28 @@ class PipelineManager(metaclass=ABCMeta):
else:
relative_priority = 0
for job in jobs:
provider = self._getPausedParentProvider(build_set, job)
priority = self._calculateNodeRequestPriority(build_set, job)
tenant_name = build_set.item.pipeline.tenant.name
pipeline_name = build_set.item.pipeline.name
req = self.sched.nodepool.requestNodes(
build_set.uuid, job, tenant_name, pipeline_name, provider,
priority, relative_priority, event=item.event)
log.debug("Adding node request %s for job %s to item %s",
req, job, item)
build_set.setJobNodeRequestID(job.name, req.id)
if req.fulfilled:
nodeset = self.sched.nodepool.getNodeSet(req, job.nodeset)
build_set.jobNodeRequestComplete(req.job_name, nodeset)
else:
job.setWaitingStatus(f'node request: {req.id}')
self._makeNodepoolRequest(log, build_set, job, relative_priority)
return True
def _makeNodepoolRequest(self, log, build_set, job, relative_priority,
alternative=0):
provider = self._getPausedParentProvider(build_set, job)
priority = self._calculateNodeRequestPriority(build_set, job)
tenant_name = build_set.item.pipeline.tenant.name
pipeline_name = build_set.item.pipeline.name
item = build_set.item
req = self.sched.nodepool.requestNodes(
build_set.uuid, job, tenant_name, pipeline_name, provider,
priority, relative_priority, event=item.event)
log.debug("Adding node request %s for job %s to item %s",
req, job, item)
build_set.setJobNodeRequestID(job.name, req.id)
if req.fulfilled:
nodeset = self.sched.nodepool.getNodeSet(req, job.nodeset)
build_set.jobNodeRequestComplete(req.job_name, nodeset)
else:
job.setWaitingStatus(f'node request: {req.id}')
def _getPausedParent(self, build_set, job):
job_graph = build_set.job_graph
if job_graph:
@@ -1909,17 +1914,46 @@ class PipelineManager(metaclass=ABCMeta):
build_set.setExtraRepoState(event.repo_state)
build_set.repo_state_state = build_set.COMPLETE
def _handleNodeRequestFallback(self, log, build_set, job, old_request):
if len(job.nodeset_alternatives) <= job.nodeset_index + 1:
# No alternatives to fall back upon
return False
# Increment the nodeset index and remove the old request
with job.activeContext(self.current_context):
job.nodeset_index = job.nodeset_index + 1
log.info("Re-attempting node request for job "
f"{job.name} of item {build_set.item} "
f"with nodeset alternative {job.nodeset_index}")
build_set.removeJobNodeRequestID(job.name)
# Make a new request
if self.sched.globals.use_relative_priority:
relative_priority = build_set.item.getNodePriority()
else:
relative_priority = 0
log = build_set.item.annotateLogger(self.log)
self._makeNodepoolRequest(log, build_set, job, relative_priority)
return True
def onNodesProvisioned(self, request, nodeset, build_set):
# TODOv3(jeblair): handle provisioning failure here
log = get_annotated_logger(self.log, request.event_id)
self.reportPipelineTiming('node_request_time', request.created_time)
if nodeset is not None:
build_set.jobNodeRequestComplete(request.job_name, nodeset)
job = build_set.item.getJob(request.job_name)
# First see if we need to retry the request
if not request.fulfilled:
log.info("Node request %s: failure for %s",
request, request.job_name)
job = build_set.item.getJob(request.job_name)
if self._handleNodeRequestFallback(log, build_set, job, request):
return
# No more fallbacks -- tell the buildset the request is complete
if nodeset is not None:
build_set.jobNodeRequestComplete(request.job_name, nodeset)
# Put a fake build through the cycle to clean it up.
if not request.fulfilled:
fakebuild = build_set.item.setNodeRequestFailure(job)
try:
self.sql.reportBuildEnd(