Merge "Try harder to unlock failed build requests"

2021-09-08 18:34:48 +00:00 · 2021-09-08 18:34:48 +00:00 · 5022764379
parent fa647d86fd 6fcde31c9e
commit 5022764379
7 changed files with 293 additions and 67 deletions
--- a/tests/unit/test_executor.py
+++ b/tests/unit/test_executor.py
@ -1,5 +1,6 @@
 # Copyright 2012 Hewlett-Packard Development Company, L.P.
 # Copyright 2014 Wikimedia Foundation Inc.
+# Copyright 2021 Acme Gating, LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
@ -18,6 +19,7 @@ import logging
 import configparser
 import multiprocessing
 import os
+import re
 import time
 from unittest import mock

@ -1060,3 +1062,18 @@ class TestVarSquash(BaseTestCase):
                'extra': 'extravar_extra'},
        }
        self.assertEqual(out, expected)
+
+
+class TestExecutorFailure(ZuulTestCase):
+    tenant_config_file = 'config/single-tenant/main.yaml'
+
+    @mock.patch('zuul.executor.server.ExecutorServer.executeJob')
+    def test_executor_job_start_failure(self, execute_job_mock):
+        execute_job_mock.side_effect = Exception('Failed to start')
+        A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A')
+        A.addApproval('Code-Review', 2)
+        self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
+        self.waitUntilSettled()
+        self.assertTrue(re.search(
+            '- project-merge .* ERROR',
+            A.messages[-1]))
--- a/tests/unit/test_zk.py
+++ b/tests/unit/test_zk.py
@ -1042,8 +1042,8 @@ class TestMergerApi(ZooKeeperBaseTestCase):
        b.state = MergeRequest.RUNNING
        merger_api.update(b)

-        c.state = MergeRequest.RUNNING
        merger_api.lock(c)
+        c.state = MergeRequest.RUNNING
        merger_api.update(c)

        d.state = MergeRequest.COMPLETED
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@ -1,4 +1,5 @@
 # Copyright 2014 OpenStack Foundation
+# Copyright 2021 Acme Gating, LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
@ -31,6 +32,7 @@ import time
 import traceback
 from concurrent.futures.process import ProcessPoolExecutor, BrokenProcessPool

+from kazoo.exceptions import NoNodeError
 import git
 from urllib.parse import urlsplit

@ -3544,16 +3546,39 @@ class ExecutorServer(BaseMergeServer):
        log = get_annotated_logger(
            self.log, event=None, build=build_request.uuid
        )
+        # Lock and update the build request
        if not self.executor_api.lock(build_request, blocking=False):
            return

+        # Ensure that the request is still in state requested. This method is
+        # called based on cached data and there might be a mismatch between the
+        # cached state and the real state of the request. The lock might
+        # have been successful because the request is already completed and
+        # thus unlocked.
+        if build_request.state != BuildRequest.REQUESTED:
+            self._retry(build_request.lock, log, self.executor_api.unlock,
+                        build_request)
+
        try:
-            build_request.state = BuildRequest.RUNNING
            params = self.executor_api.getParams(build_request)
-            self.executor_api.clearParams(build_request)
-            # Directly update the build in ZooKeeper, so we don't
-            # loop over and try to lock it again and again.
+            # Directly update the build in ZooKeeper, so we don't loop
+            # over and try to lock it again and again.  Do this before
+            # clearing the params so if we fail, no one tries to
+            # re-run the job.
+            build_request.state = BuildRequest.RUNNING
            self.executor_api.update(build_request)
+        except Exception:
+            log.exception("Exception while preparing to start worker")
+            # If we failed at this point, we have not written anything
+            # to ZK yet; the only thing we need to do is to ensure
+            # that we release the lock, and another executor will be
+            # able to grab the build.
+            self._retry(build_request.lock, log, self.executor_api.unlock,
+                        build_request)
+            return
+
+        try:
+            self.executor_api.clearParams(build_request)
            log.debug("Next executed job: %s", build_request)
            self.executeJob(build_request, params)
        except Exception:
@ -3561,7 +3586,7 @@ class ExecutorServer(BaseMergeServer):
            # sucessfuly start executing the job, it's the
            # AnsibleJob's responsibility to call completeBuild and
            # unlock the request.
-            log.exception("Exception while running job")
+            log.exception("Exception while starting worker")
            result = {
                "result": "ERROR",
                "exception": traceback.format_exc(),
@ -3833,27 +3858,29 @@ class ExecutorServer(BaseMergeServer):
        # result dict for that.
        result["end_time"] = time.time()

+        log = get_annotated_logger(self.log, build_request.event_id,
+                                   build=build_request.uuid)
+
        # NOTE (felix): We store the end_time on the ansible job to calculate
        # the in-use duration of locked nodes when the nodeset is returned.
-        ansible_job = self.job_workers[build_request.uuid]
+        # NOTE: this method may be called before we create a job worker.
+        ansible_job = self.job_workers.get(build_request.uuid)
+        if ansible_job:
            ansible_job.end_time = time.monotonic()

            params = ansible_job.arguments
-        # If the result is None, check if the build has reached its max
-        # attempts and if so set the result to RETRY_LIMIT.
-        # This must be done in order to correctly process the autohold in the
-        # next step. Since we only want to hold the node if the build has
-        # reached a final result.
+            # If the result is None, check if the build has reached
+            # its max attempts and if so set the result to
+            # RETRY_LIMIT.  This must be done in order to correctly
+            # process the autohold in the next step. Since we only
+            # want to hold the node if the build has reached a final
+            # result.
            if result.get("result") is None:
                attempts = params["zuul"]["attempts"]
                max_attempts = params["max_attempts"]
                if attempts >= max_attempts:
                    result["result"] = "RETRY_LIMIT"

-        zuul_event_id = params["zuul_event_id"]
-        log = get_annotated_logger(self.log, zuul_event_id,
-                                   build=build_request.uuid)
-
            # Provide the hold information back to the scheduler via the build
            # result.
            try:
@ -3863,16 +3890,63 @@ class ExecutorServer(BaseMergeServer):
            except Exception:
                log.exception("Unable to process autohold for %s", ansible_job)

-        build_request.state = BuildRequest.COMPLETED
+        def update_build_request(log, build_request):
            try:
                self.executor_api.update(build_request)
+                return True
            except JobRequestNotFound as e:
-            self.log.warning("Could not complete build: %s", str(e))
-            return
+                log.warning("Could not find build: %s", str(e))
+                return False

-        # Unlock the build request
-        self.executor_api.unlock(build_request)
-
-        event = BuildCompletedEvent(build_request.uuid, result)
+        def put_complete_event(log, build_request, event):
+            try:
                self.result_events[build_request.tenant_name][
                    build_request.pipeline_name].put(event)
+            except NoNodeError:
+                log.warning("Pipeline was removed: %s",
+                            build_request.pipeline_name)
+
+        build_request.state = BuildRequest.COMPLETED
+        found = self._retry(build_request.lock, log,
+                            update_build_request, log, build_request)
+        lock_valid = build_request.lock.is_still_valid()
+        if lock_valid:
+            # We only need to unlock if we're still locked.
+            self._retry(build_request.lock, log, self.executor_api.unlock,
+                        build_request)
+
+        if not found:
+            # If the build request is gone, don't return a result.
+            return
+
+        if not lock_valid:
+            # If we lost the lock at any point before updating the
+            # state to COMPLETED, then the scheduler may have (or
+            # will) detect it as an incomplete build and generate an
+            # error event for us.  We don't need to submit a
+            # completion event in that case.
+            #
+            # TODO: If we make the scheduler robust against receiving
+            # duplicate completion events for the same build, we could
+            # choose continue here and submit the completion event in
+            # the hopes that we would win the race against the cleanup
+            # thread.  That might (in some narrow circumstances)
+            # rescue an otherwise acceptable build from being
+            # discarded.
+            return
+
+        # TODO: This is racy.  Once we have set the build request to
+        # completed, the only way for it to be deleted is for the
+        # scheduler to process a BuildRequestCompleted event.  So we
+        # need to try really hard to give it one.  But if we exit
+        # between the section above and the section below, we won't,
+        # which will mean that the scheduler will not automatically
+        # delete the build request and we will not be able to recover.
+        #
+        # This is essentially a two-phase commit problem, but we are
+        # unable to use transactions because the result event is
+        # sharded.  We should be able to redesign the result reporting
+        # mechanism to eliminate the race and be more convergent.
+        event = BuildCompletedEvent(build_request.uuid, result)
+        self._retry(None, log, put_complete_event, log,
+                    build_request, event)
--- a/zuul/merger/server.py
+++ b/zuul/merger/server.py
@ -22,6 +22,8 @@ import time
 from abc import ABCMeta
 from configparser import ConfigParser

+from kazoo.exceptions import NoNodeError
+
 from zuul.lib import commandsocket
 from zuul.lib.config import get_default
 from zuul.lib.logutil import get_annotated_logger
@ -204,14 +206,34 @@ class BaseMergeServer(metaclass=ABCMeta):
        log = get_annotated_logger(
            self.log, merge_request.event_id
        )
-
+        # Lock and update the merge request
        if not self.merger_api.lock(merge_request, blocking=False):
            return

+        # Ensure that the request is still in state requested. This method is
+        # called based on cached data and there might be a mismatch between the
+        # cached state and the real state of the request. The lock might
+        # have been successful because the request is already completed and
+        # thus unlocked.
+        if merge_request.state != MergeRequest.REQUESTED:
+            self._retry(merge_request.lock, log, self.merger_api.unlock,
+                        merge_request)
+
+        try:
+            merge_request.state = MergeRequest.RUNNING
+            params = self.merger_api.getParams(merge_request)
+        except Exception:
+            log.exception("Exception while preparing to start merge job")
+            # If we failed at this point, we have not written anything
+            # to ZK yet; the only thing we need to do is to ensure
+            # that we release the lock, and another merger will be
+            # able to grab the request.
+            self._retry(merge_request.lock, log, self.merger_api.unlock,
+                        merge_request)
+            return
+
        result = None
        try:
-            merge_request.state = MergeRequest.RUNNING
-            params = self.merger_api.getParams(merge_request)
            self.merger_api.clearParams(merge_request)
            # Directly update the merge request in ZooKeeper, so we
            # don't loop over and try to lock it again and again.
@ -344,13 +366,18 @@ class BaseMergeServer(metaclass=ABCMeta):
            item_in_branches,
        )

+        lock_valid = merge_request.lock.is_still_valid()
+        if not lock_valid:
+            return
+
        # Provide a result either via a result future or a result event
        if merge_request.result_path:
            log.debug(
                "Providing synchronous result via future for %s",
                merge_request,
            )
-            self.merger_api.reportResult(merge_request, result)
+            self._retry(merge_request.lock, log,
+                        self.merger_api.reportResult, merge_request, result)

        elif merge_request.build_set_uuid:
            log.debug(
@ -373,10 +400,16 @@ class BaseMergeServer(metaclass=ABCMeta):
                    item_in_branches,
                )

-            tenant_name = merge_request.tenant_name
-            pipeline_name = merge_request.pipeline_name
+            def put_complete_event(log, merge_request, event):
+                try:
+                    self.result_events[merge_request.tenant_name][
+                        merge_request.pipeline_name].put(event)
+                except NoNodeError:
+                    log.warning("Pipeline was removed: %s",
+                                merge_request.pipeline_name)

-            self.result_events[tenant_name][pipeline_name].put(event)
+            self._retry(merge_request.lock, log,
+                        put_complete_event, log, merge_request, event)

        # Set the merge request to completed, unlock and delete it. Although
        # the state update is mainly for consistency reasons, it might come in
@ -384,12 +417,43 @@ class BaseMergeServer(metaclass=ABCMeta):
        # the merge request was already processed and we have a result in the
        # result queue.
        merge_request.state = MergeRequest.COMPLETED
-        self.merger_api.update(merge_request)
-        self.merger_api.unlock(merge_request)
+        self._retry(merge_request.lock, log,
+                    self.merger_api.update, merge_request)
+        self._retry(merge_request.lock, log,
+                    self.merger_api.unlock, merge_request)
        # TODO (felix): If we want to optimize ZK requests, we could only call
        # the remove() here.
        self.merger_api.remove(merge_request)

+    def _retry(self, lock, log, fn, *args, **kw):
+        """Retry a method to deal with ZK connection issues
+
+        This is a helper method to retry ZK operations as long as it
+        makes sense to do so.  If we have encountered a suspended
+        connection, we can probably just retry the ZK operation until
+        it succeeds.  If we have fully lost the connection, then we
+        have lost the lock, so we may not care in that case.
+
+        This method exits when one of the following occurs:
+
+        * The callable function completes.
+        * This server stops.
+        * The lock (if supplied) is invalidated due to connection loss.
+
+        Pass None as the lock parameter if the lock issue is not
+        relevant.
+        """
+        while True:
+            if lock and not lock.is_still_valid():
+                return
+            try:
+                return fn(*args, **kw)
+            except Exception:
+                log.exception("Exception retrying %s", fn)
+            if not self._running:
+                return
+            time.sleep(5)
+

 class MergeServer(BaseMergeServer):
    log = logging.getLogger("zuul.MergeServer")
--- a/zuul/model.py
+++ b/zuul/model.py
@ -2113,6 +2113,11 @@ class JobRequest:
            "result_path": self.result_path,
        }

+    def updateFromDict(self, data):
+        self.precedence = data["precedence"]
+        self.state = data["state"]
+        self.result_path = data["result_path"]
+
    @classmethod
    def fromDict(cls, data):
        return cls(
--- a/zuul/zk/job_request_queue.py
+++ b/zuul/zk/job_request_queue.py
@ -21,7 +21,6 @@ from enum import Enum

 from kazoo.exceptions import LockTimeout, NoNodeError
 from kazoo.protocol.states import EventType
-from kazoo.recipe.lock import Lock

 from zuul.lib.jsonutil import json_dumps
 from zuul.lib.logutil import get_annotated_logger
@ -30,6 +29,7 @@ from zuul.zk import ZooKeeperSimpleBase, sharding
 from zuul.zk.event_queues import JobResultFuture
 from zuul.zk.exceptions import JobRequestNotFound
 from zuul.zk.vendor.watchers import ExistingDataWatch
+from zuul.zk.locks import SessionAwareLock


 class JobRequestEvent(Enum):
@ -292,6 +292,23 @@ class JobRequestQueue(ZooKeeperSimpleBase):

        return request

+    def refresh(self, request):
+        """Refreshs a request object with the current data from ZooKeeper. """
+        try:
+            data, zstat = self.kazoo_client.get(request.path)
+        except NoNodeError:
+            raise JobRequestNotFound(
+                f"Could not refresh {request}, ZooKeeper node is missing")
+
+        if not data:
+            raise JobRequestNotFound(
+                f"Could not refresh {request}, ZooKeeper node is empty")
+
+        content = self._bytesToDict(data)
+
+        request.updateFromDict(content)
+        request._zstat = zstat
+
    def remove(self, request):
        self.log.debug("Removing request %s", request)
        try:
@ -340,7 +357,7 @@ class JobRequestQueue(ZooKeeperSimpleBase):
        have_lock = False
        lock = None
        try:
-            lock = Lock(self.kazoo_client, path)
+            lock = SessionAwareLock(self.kazoo_client, path)
            have_lock = lock.acquire(blocking, timeout)
        except LockTimeout:
            have_lock = False
@ -354,20 +371,14 @@ class JobRequestQueue(ZooKeeperSimpleBase):
            return False

        if not self.kazoo_client.exists(request.path):
-            lock.release()
-            self.log.error(
-                "Request not found for locking: %s", request.uuid
-            )
+            self._releaseLock(request, lock)
+            return False

-            # We may have just re-created the lock parent node just after the
-            # scheduler deleted it; therefore we should (re-) delete it.
+        # Update the request to ensure that we operate on the newest data.
        try:
-                # Delete the lock parent node as well.
-                path = "/".join([self.LOCK_ROOT, request.uuid])
-                self.kazoo_client.delete(path, recursive=True)
-            except NoNodeError:
-                pass
-
+            self.refresh(request)
+        except JobRequestNotFound:
+            self._releaseLock(request, lock)
            return False

        request.lock = lock
@ -380,6 +391,24 @@ class JobRequestQueue(ZooKeeperSimpleBase):

        return True

+    def _releaseLock(self, request, lock):
+        """Releases a lock.
+
+        This is used directly after acquiring the lock in case something went
+        wrong.
+        """
+        lock.release()
+        self.log.error("Request not found for locking: %s", request.uuid)
+
+        # We may have just re-created the lock parent node just after the
+        # scheduler deleted it; therefore we should (re-) delete it.
+        try:
+            # Delete the lock parent node as well.
+            path = "/".join([self.LOCK_ROOT, request.uuid])
+            self.kazoo_client.delete(path, recursive=True)
+        except NoNodeError:
+            pass
+
    def unlock(self, request):
        if request.lock is None:
            self.log.warning(
@ -391,7 +420,7 @@ class JobRequestQueue(ZooKeeperSimpleBase):

    def isLocked(self, request):
        path = "/".join([self.LOCK_ROOT, request.uuid])
-        lock = Lock(self.kazoo_client, path)
+        lock = SessionAwareLock(self.kazoo_client, path)
        is_locked = len(lock.contenders()) > 0
        return is_locked

--- a/zuul/zk/locks.py
+++ b/zuul/zk/locks.py
@ -16,13 +16,50 @@ import logging
 from contextlib import contextmanager
 from urllib.parse import quote_plus

+from kazoo.protocol.states import KazooState
+
 from zuul.zk.exceptions import LockException
-from zuul.zk.vendor.lock import ReadLock, WriteLock
+from zuul.zk.vendor.lock import Lock, ReadLock, WriteLock

 LOCK_ROOT = "/zuul/locks"
 TENANT_LOCK_ROOT = f"{LOCK_ROOT}/tenant"


+class SessionAwareLock(Lock):
+    def __init__(self, client, path, identifier=None, extra_lock_patterns=()):
+        self._zuul_ephemeral = None
+        self._zuul_session_expired = False
+        self._zuul_watching_session = False
+        super().__init__(client, path, identifier, extra_lock_patterns)
+
+    def acquire(self, blocking=True, timeout=None, ephemeral=True):
+        ret = super().acquire(blocking, timeout, ephemeral)
+        self._zuul_session_expired = False
+        if ret and ephemeral:
+            self._zuul_ephemeral = ephemeral
+            self.client.add_listener(self._zuul_session_watcher)
+            self._zuul_watching_session = True
+        return ret
+
+    def release(self):
+        if self._zuul_watching_session:
+            self.client.remove_listener(self._zuul_session_watcher)
+            self._zuul_watching_session = False
+        return super().release()
+
+    def _zuul_session_watcher(self, state):
+        if state == KazooState.LOST:
+            self._zuul_session_expired = True
+
+            # Return true to de-register
+            return True
+
+    def is_still_valid(self):
+        if not self._zuul_ephemeral:
+            return True
+        return not self._zuul_session_expired
+
+
@contextmanager
 def locked(lock, blocking=True, timeout=None):
    if not lock.acquire(blocking=blocking, timeout=timeout):