# Copyright 2012 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import logging
import time
from uuid import uuid4

import zuul.executor.common
from zuul.lib.logutil import get_annotated_logger
from zuul.model import (
    Build,
    BuildCompletedEvent,
    BuildRequest,
    BuildStartedEvent,
    PRIORITY_MAP,
)
from zuul.zk.event_queues import PipelineResultEventQueue
from zuul.zk.executor import ExecutorApi
from zuul.zk.exceptions import JobRequestNotFound
import zuul.lib.tracing as tracing

from kazoo.exceptions import BadVersionError
from opentelemetry import trace


class ExecutorClient(object):
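    """Client used by the scheduler to dispatch builds to executors.

    Build requests are submitted to ZooKeeper via the ExecutorApi;
    results are returned to the scheduler through the pipeline result
    event queues.
    """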
    log = logging.getLogger("zuul.ExecutorClient")
    _executor_api_class = ExecutorApi

    def __init__(self, config, sched):
        self.config = config
        self.sched = sched

        self.executor_api = self._executor_api_class(self.sched.zk_client)
        self.result_events = PipelineResultEventQueue.createRegistry(
            self.sched.zk_client
        )

    def stop(self):
        self.log.debug("Stopping")

    def execute(self, job, nodes, item, pipeline, executor_zone,
                dependent_changes=[], merger_items=[]):
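        """Submit a build request for the given job to ZooKeeper.

        Builds of the 'noop' job are reported as started and completed
        immediately without being dispatched to an executor.
        """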
        log = get_annotated_logger(self.log, item.event)
        tracer = trace.get_tracer("zuul")
        uuid = str(uuid4().hex)
        log.info(
            "Execute job %s (uuid: %s) on nodes %s for %s "
            "with dependent changes %s",
            job, uuid, nodes, item, dependent_changes)

        params = zuul.executor.common.construct_build_params(
            uuid, self.sched.connections,
            job, item, pipeline, dependent_changes, merger_items,
            redact_secrets_and_keys=False)
        # TODO: deprecate and remove this variable?
        params["zuul"]["_inheritance_path"] = list(job.inheritance_path)

        semaphore_handler = item.pipeline.tenant.semaphore_handler
        params['semaphore_handle'] = semaphore_handler.getSemaphoreHandle(
            item, job)

        parent_span = tracing.restoreSpan(item.current_build_set.span_info)
        execute_time = time.time()
        with trace.use_span(parent_span):
            build_span = tracer.start_span("Build", start_time=execute_time)
        build_span_info = tracing.getSpanInfo(build_span)
        build = Build.new(
            pipeline.manager.current_context,
            job=job,
            build_set=item.current_build_set,
            uuid=uuid,
            execute_time=execute_time,
            span_info=build_span_info,
            zuul_event_id=item.event.zuul_event_id,
        )

        log.debug("Adding build %s of job %s to item %s",
                  build, job, item)
        item.addBuild(job, build)

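        # The noop job requires neither nodes nor an executor; report it
        # as started and completed immediately.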
        if job.name == 'noop':
            data = {"start_time": time.time()}
            started_event = BuildStartedEvent(
                build.uuid, build.build_set.uuid, job.uuid,
                None, data, zuul_event_id=build.zuul_event_id)
            self.result_events[pipeline.tenant.name][pipeline.name].put(
                started_event
            )

            result = {"result": "SUCCESS", "end_time": time.time()}
            completed_event = BuildCompletedEvent(
                build.uuid, build.build_set.uuid, job.uuid,
                None, result, zuul_event_id=build.zuul_event_id)
            self.result_events[pipeline.tenant.name][pipeline.name].put(
                completed_event
            )

            return

        # Update zuul attempts after addBuild above to ensure build_set
        # is up to date.
        attempts = build.build_set.getTries(job)
        params["zuul"]['attempts'] = attempts
        params['zuul']['max_attempts'] = job.attempts
        # TODO (swestphahl): Remove deprecated 'max_attempts' parameter
        params['max_attempts'] = job.attempts

        # Store the NodeRequest ID in the job arguments, so we can look it up
        # on the executor side to lock the nodes.
        req_id = build.build_set.getJobNodeRequestID(job)
        if isinstance(req_id, dict):
            # This is a stop-gap. It is possible for this to happen
            # if a queue item completes all its builds and is removed
            # while one of its builds is deduplicated in another queue
            # item in an independent pipeline. The bundle refactor
            # work will remove this possibility at which point this
            # code can be removed. In the meantime, if we encounter
            # this case, restart the build to try to keep things
            # moving.
            self.log.error(
                "Attempt to start build with deduplicated node request ID "
                f"{req_id}")
            data = {"start_time": time.time()}
            started_event = BuildStartedEvent(
                build.uuid, build.build_set.uuid, job.uuid,
                None, data, zuul_event_id=build.zuul_event_id)
            self.result_events[pipeline.tenant.name][pipeline.name].put(
                started_event
            )

            result = {"result": None, "end_time": time.time()}
            completed_event = BuildCompletedEvent(
                build.uuid, build.build_set.uuid, job.uuid,
                None, result, zuul_event_id=build.zuul_event_id)
            self.result_events[pipeline.tenant.name][pipeline.name].put(
                completed_event
            )
            return

        if req_id:
            params["noderequest_id"] = req_id

        zone_known = False
        if executor_zone:
            # Check the component registry for executors subscribed to this
            # zone
            for comp in self.sched.component_registry.all(kind="executor"):
                if comp.zone == executor_zone:
                    zone_known = True
                    break

            if not zone_known:
                self.log.warning(
                    "Job requested '%s' zuul-executor zone, but no "
                    "zuul-executors found for this zone; ignoring zone "
                    "request", executor_zone)
                # Fall back to the default zone
                executor_zone = None

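        # Submit the build request to ZooKeeper, where an executor
        # serving the requested zone (or the default zone) can lock
        # and run it.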
        with trace.use_span(build_span):
            request = BuildRequest(
                uuid=uuid,
                build_set_uuid=build.build_set.uuid,
                job_uuid=job.uuid,
                tenant_name=build.build_set.item.pipeline.tenant.name,
                pipeline_name=build.build_set.item.pipeline.name,
                zone=executor_zone,
                event_id=item.event.zuul_event_id,
                precedence=PRIORITY_MAP[pipeline.precedence],
            )
            self.executor_api.submit(request, params)
        build.updateAttributes(pipeline.manager.current_context,
                               build_request_ref=request.path)

    def cancel(self, build):
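        """Cancel the given build.

        Returns True if the build was canceled, and False if it was not
        found or had not yet been submitted to ZooKeeper.
        """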
        log = get_annotated_logger(self.log, build.zuul_event_id,
                                   build=build.uuid)
        # Returns whether a running build was canceled
        log.info("Cancel build %s for job %s", build, build.job)

        build.updateAttributes(
            build.build_set.item.pipeline.manager.current_context,
            canceled=True)

        if not build.build_request_ref:
            log.debug("Build has not been submitted to ZooKeeper")
            return False

        build_request = self.executor_api.get(build.build_request_ref)
        if build_request:
            log.debug("Canceling build request %s", build_request)
            # If we can acquire the build request lock here, the build
            # wasn't picked up by any executor server yet. By acquiring
            # the lock we prevent the executor server from picking up the
            # build, so we can cancel it before it runs.
            if self.executor_api.lock(build_request, blocking=False):
                log.debug(
                    "Canceling build %s directly because it is not locked by "
                    "any executor",
                    build_request,
                )
                # Mark the build request as complete and forward the event to
                # the scheduler, so the executor server doesn't pick up the
                # request. The build will be deleted from the scheduler when
                # it picks up the BuildCompletedEvent.
                try:
                    build_request.state = BuildRequest.COMPLETED
                    self.executor_api.update(build_request)

                    result = {"result": "CANCELED", "end_time": time.time()}
                    tenant_name = build.build_set.item.pipeline.tenant.name
                    pipeline_name = build.build_set.item.pipeline.name
                    event = BuildCompletedEvent(
                        build_request.uuid, build_request.build_set_uuid,
                        build_request.job_uuid,
                        build_request.path, result)
                    self.result_events[tenant_name][pipeline_name].put(event)
                finally:
                    self.executor_api.unlock(build_request)
            else:
                log.debug(
                    "Sending cancel request for build %s because it is "
                    "locked",
                    build_request,
                )
                # If the build request is locked, schedule a cancel request
                # in the executor server.
                self.executor_api.requestCancel(build_request)

            log.debug("Canceled build")
            return True

        return False

    def resumeBuild(self, build):
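        """Request that the given build be resumed.

        Returns True if a resume request was submitted, False otherwise.
        """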
        log = get_annotated_logger(self.log, build.zuul_event_id)

        if not build.build_request_ref:
            log.debug("Build has not been submitted")
            return False

        build_request = self.executor_api.get(build.build_request_ref)
        if build_request:
            log.debug("Requesting resume for build %s", build)
            self.executor_api.requestResume(build_request)
            return True
        return False

    def removeBuild(self, build):
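        """Remove the build request for the given build from ZooKeeper."""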
        log = get_annotated_logger(self.log, build.zuul_event_id)
        log.debug("Removing build %s", build.uuid)

        if not build.build_request_ref:
            log.debug("Build %s has not been submitted to ZooKeeper",
                      build.uuid)
            return

        build_request = self.executor_api.get(build.build_request_ref)
        if build_request:
            # TODO (felix): We could directly remove the build request via
            # its path in ZK to spare a read operation. Usually there should
            # be no need to look up the build request object from ZooKeeper
            # just to immediately remove it.
            self.executor_api.remove(build_request)

    def cleanupLostBuildRequests(self):
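        """Clean up all lost build requests found in ZooKeeper."""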
        for build_request in self.executor_api.lostRequests():
            try:
                self.cleanupLostBuildRequest(build_request)
            except Exception:
                self.log.exception("Exception cleaning up lost build request:")

    def cleanupLostBuildRequest(self, build_request):
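        """Complete a lost build request and report the build as ABORTED."""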
        result = {"result": "ABORTED"}

        # TODO (felix): Once the builds are stored in ZooKeeper, we can store
        # the end_time directly on the build. But for now we have to use the
        # result dict for that.
        result["end_time"] = time.time()

        build_request.state = BuildRequest.COMPLETED
        try:
            self.executor_api.update(build_request)
        except JobRequestNotFound as e:
            self.log.warning("Could not complete build: %s", str(e))
            # In case we re-created the lock directory, still remove
            # the request for the side effect of removing the lock.
            self.executor_api.remove(build_request)
            return
        except BadVersionError:
            # There could be a race condition:
            # The build is found by lost_builds in state RUNNING
            # but gets completed/unlocked before the is_locked()
            # check. Since we use the znode version, the update
            # will fail in this case and we can simply ignore the
            # exception.
            return

        # No need to unlock the build, as it is by definition unlocked

        event = BuildCompletedEvent(
            build_request.uuid, build_request.build_set_uuid,
            build_request.job_uuid,
            build_request.path, result)
        self.result_events[build_request.tenant_name][
            build_request.pipeline_name].put(event)