Files
zuul/zuul/zk/locks.py
James E. Blair 0dfb4752ee Retry lock cleanup
If one actor (thread, scheduler, etc.) is holding a lock, and another
attempts a non-blocking acquisition (e.g., a pipeline processing lock),
the second actor can encounter the following sequence:

1) create lock contender
2) see that lock contender did not win the election
3) connection transitions to suspended
4) attempt to delete lock contender

Steps 1 and 2 happen in a kazoo retry handler, so if the connection
is suspended, the attempt will retry.  Step 4 is not performed in
a retry handler; in fact the implementation method is named
"_best_effort_cleanup".

Making a best effort at cleanup would be okay for a blocking acquisition
with no timeout since we would always succeed, but it is not enough for
a non-blocking acquisition.

We must delete our lock contender if the connection is suspended (not
expired) because if the connection resumes, our lock contender, which
is orphaned by then, will later win the election.  If the connection
does expire, then it doesn't matter; the ephemeral lock would be released.

The current best effort method also suppresses the KazooException, so
when this does happen, there is no record of it in our logs, making
this difficult to identify.

To correct the behavior, we will retry the lock cleanup indefinitely,
so we can be assured that even if the connection is suspended, we will
delete our lock contender.

See also https://github.com/python-zk/kazoo/issues/732 and
https://github.com/python-zk/kazoo/pull/760

Change-Id: Ieb970373735557bfccdf20fee27102de660052de
2025-02-26 14:06:44 -08:00

234 lines
7.4 KiB
Python

# Copyright 2021 BMW Group
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import logging
from contextlib import contextmanager
from urllib.parse import quote_plus
from kazoo.exceptions import NoNodeError
from kazoo.protocol.states import KazooState
from kazoo.recipe.lock import Lock, ReadLock, WriteLock
from zuul.zk.exceptions import LockException
# Base ZooKeeper path; every lock path built in this module starts with it.
LOCK_ROOT = "/zuul/locks"
# Parent path of the per-tenant read/write locks (used by tenant_read_lock
# and tenant_write_lock).
TENANT_LOCK_ROOT = f"{LOCK_ROOT}/tenant"
# Parent path for connection locks; not referenced by the code visible in
# this module, presumably consumed by importers.
CONNECTION_LOCK_ROOT = f"{LOCK_ROOT}/connection"
class SessionAwareMixin:
    """Mixin for kazoo lock recipes adding session and contender tracking.

    It is meant to be listed before a kazoo lock class (``Lock``,
    ``ReadLock``, ``WriteLock``) in the bases of a subclass.  On top of
    the recipe it provides:

    * a record of whether the ZooKeeper session was lost while an
      ephemeral lock was held (see ``is_still_valid``);
    * an optional children watch that remembers the data of every
      contender node seen under the lock path (see
      ``watch_for_contenders`` and ``contender_present``);
    * an opt-out of the recipe's path creation (``ensure_path=False``);
    * a retried replacement for the recipe's ``_best_effort_cleanup``.
    """

    def __init__(self, client, path, identifier=None, extra_lock_patterns=(),
                 ensure_path=True):
        """Initialise the tracking state, then the underlying recipe.

        :param client: client object handed to the lock recipe.
        :param path: lock path handed to the lock recipe.
        :param identifier: optional identifier handed to the lock recipe.
        :param extra_lock_patterns: handed to the lock recipe unchanged.
        :param ensure_path: when false, ``_ensure_path`` becomes a no-op.
        """
        # Path handling.
        self._zuul_ensure_path = ensure_path
        # Session tracking.
        self._zuul_ephemeral = None
        self._zuul_session_expired = False
        self._zuul_watching_session = False
        # Contender tracking.
        self._zuul_contender_watch = None
        self._zuul_seen_contenders = set()
        self._zuul_seen_contender_names = set()
        super().__init__(client, path, identifier, extra_lock_patterns)

    def _ensure_path(self):
        """Recipe override: only ensure the path if that was requested."""
        if not self._zuul_ensure_path:
            return None
        return super()._ensure_path()

    def acquire(self, blocking=True, timeout=None, ephemeral=True):
        """Acquire the lock and, for ephemeral locks, watch the session.

        The arguments are passed positionally to the recipe's
        ``acquire`` and its result is returned unchanged.  The
        session-expired flag is reset on every call; the session
        listener is only registered when the acquisition succeeded and
        the lock is ephemeral.
        """
        acquired = super().acquire(blocking, timeout, ephemeral)
        self._zuul_session_expired = False
        if not (acquired and ephemeral):
            return acquired
        self._zuul_ephemeral = ephemeral
        self.client.add_listener(self._zuul_session_watcher)
        self._zuul_watching_session = True
        return acquired

    def release(self):
        """Unregister the session listener (if any) and release the lock."""
        if self._zuul_watching_session:
            self.client.remove_listener(self._zuul_session_watcher)
            self._zuul_watching_session = False
        return super().release()

    def _zuul_session_watcher(self, state):
        """Connection-state listener: remember that the session was lost."""
        if state == KazooState.LOST:
            self._zuul_session_expired = True
            # A true return value de-registers this listener.
            return True

    def is_still_valid(self):
        """Return False only for an ephemeral lock whose session was lost."""
        return not (self._zuul_ephemeral and self._zuul_session_expired)

    def watch_for_contenders(self):
        """Start a children watch on the lock path.

        :raises Exception: if the lock is not currently acquired.
        """
        if not self.is_acquired:
            raise Exception("Unable to set contender watch without lock")
        self._zuul_contender_watch = self.client.ChildrenWatch(
            self.path, self._zuul_event_watch, send_event=True)

    def _zuul_event_watch(self, children, event=None):
        """Children-watch callback: record the data of new contender nodes.

        Returns False (which stops the watch) once the lock is no longer
        acquired, True otherwise.
        """
        if not self.is_acquired:
            # Stop watching
            return False
        for contender in children or ():
            if contender not in self._zuul_seen_contenders:
                self._zuul_seen_contenders.add(contender)
                try:
                    payload, _stat = self.client.get(
                        "/".join((self.path, contender)))
                    if payload is not None:
                        self._zuul_seen_contender_names.add(
                            payload.decode("utf-8"))
                except NoNodeError:
                    # The contender vanished before we could read it.
                    pass
        return True

    def contender_present(self, name):
        """Return whether a contender whose node data equals *name* was seen.

        :raises Exception: if ``watch_for_contenders`` was never started.
        """
        if self._zuul_contender_watch is None:
            raise Exception("Watch not started")
        return name in self._zuul_seen_contender_names

    # The two methods below are a fix for a kazoo recipe bug, carried
    # here for convenience; the rest of this class does not depend on
    # them.
    # https://github.com/python-zk/kazoo/issues/732
    def _best_effort_cleanup(self):
        """Delete our contender node via the recipe's retry handler.

        Unlike a plain best-effort delete, the deletion goes through
        ``self._retry`` and exceptions are not swallowed here.
        """
        self._retry(self._inner_best_effort_cleanup)

    def _inner_best_effort_cleanup(self):
        """Delete our contender node; an already-missing node is fine."""
        node = self.node or self._find_node()
        if not node:
            return
        try:
            self._delete_node(node)
        except NoNodeError:
            pass
class SessionAwareLock(SessionAwareMixin, Lock):
    """kazoo ``Lock`` recipe combined with ``SessionAwareMixin``."""
class SessionAwareWriteLock(SessionAwareMixin, WriteLock):
    """kazoo ``WriteLock`` recipe combined with ``SessionAwareMixin``."""
class SessionAwareReadLock(SessionAwareMixin, ReadLock):
    """kazoo ``ReadLock`` recipe combined with ``SessionAwareMixin``."""
@contextmanager
def locked(lock, blocking=True, timeout=None):
    """Context manager that holds *lock* for the duration of the body.

    :param lock: object providing ``acquire(blocking=, timeout=)`` and
        ``release()``.
    :param blocking: forwarded to ``lock.acquire``.
    :param timeout: forwarded to ``lock.acquire``.
    :raises LockException: if ``lock.acquire`` returns a false value.

    A failure while releasing is logged and not propagated.
    """

    def _acquire_or_raise():
        if not lock.acquire(blocking=blocking, timeout=timeout):
            raise LockException(f"Failed to acquire lock {lock}")

    try:
        _acquire_or_raise()
    except NoNodeError:
        # We may be racing the cleanup of the lock directory: reset
        # assured_path on the lock and give it exactly one more attempt.
        lock.assured_path = False
        _acquire_or_raise()

    try:
        yield lock
    finally:
        try:
            lock.release()
        except Exception:
            logging.getLogger("zuul.zk.locks").exception(
                "Failed to release lock %s", lock)
@contextmanager
def tenant_read_lock(client, tenant_name, log=None, blocking=True):
    """Hold the read lock of *tenant_name* for the duration of the body.

    :param client: object whose ``client`` attribute is handed to the
        lock recipe.
    :param tenant_name: tenant name; URL-quoted to form the lock path.
    :param log: optional logger used for debug messages.
    :param blocking: forwarded to ``locked``.

    Yields the lock object.  The "Released" debug message is emitted
    when the body exits, just before ``locked`` releases the lock.
    """
    lock_path = f"{TENANT_LOCK_ROOT}/{quote_plus(tenant_name)}"
    if blocking and log:
        log.debug("Wait for %s read tenant lock", tenant_name)
    read_lock = SessionAwareReadLock(client.client, lock_path)
    with locked(read_lock, blocking=blocking) as lock:
        try:
            if log:
                log.debug("Aquired %s read tenant lock", tenant_name)
            yield lock
        finally:
            if log:
                log.debug("Released %s read tenant lock", tenant_name)
@contextmanager
def tenant_write_lock(client, tenant_name, log=None, blocking=True,
                      identifier=None):
    """Hold the write lock of *tenant_name* for the duration of the body.

    :param client: object whose ``client`` attribute is handed to the
        lock recipe.
    :param tenant_name: tenant name; URL-quoted to form the lock path.
    :param log: optional logger used for debug messages.
    :param blocking: forwarded to ``locked``.
    :param identifier: handed to the lock recipe and included in the
        debug messages.

    Yields the lock object.  The "Released" debug message is emitted
    when the body exits, just before ``locked`` releases the lock.
    """
    lock_path = f"{TENANT_LOCK_ROOT}/{quote_plus(tenant_name)}"
    if blocking and log:
        log.debug("Wait for %s write tenant lock (id: %s)",
                  tenant_name, identifier)
    write_lock = SessionAwareWriteLock(
        client.client, lock_path, identifier=identifier)
    with locked(write_lock, blocking=blocking) as lock:
        try:
            if log:
                log.debug("Aquired %s write tenant lock (id: %s)",
                          tenant_name, identifier)
            yield lock
        finally:
            if log:
                log.debug("Released %s write tenant lock (id: %s)",
                          tenant_name, identifier)
@contextmanager
def pipeline_lock(client, tenant_name, pipeline_name, blocking=True):
    """Hold the lock of one tenant's pipeline for the duration of the body.

    :param client: object whose ``client`` attribute is handed to the
        lock recipe.
    :param tenant_name: tenant name; URL-quoted to form the lock path.
    :param pipeline_name: pipeline name; URL-quoted to form the lock path.
    :param blocking: forwarded to ``locked``.

    Yields the lock object.
    """
    safe_tenant = quote_plus(tenant_name)
    safe_pipeline = quote_plus(pipeline_name)
    # Build the path from LOCK_ROOT (as the tenant locks do) rather than
    # repeating the literal prefix; the resulting path is unchanged.
    lock_path = f"{LOCK_ROOT}/pipeline/{safe_tenant}/{safe_pipeline}"
    with locked(
        SessionAwareLock(client.client, lock_path),
        blocking=blocking
    ) as lock:
        yield lock
@contextmanager
def management_queue_lock(client, tenant_name, blocking=True):
    """Hold a tenant's management event queue lock for the body's duration.

    :param client: object whose ``client`` attribute is handed to the
        lock recipe.
    :param tenant_name: tenant name; URL-quoted to form the lock path.
    :param blocking: forwarded to ``locked``.

    Yields the lock object.
    """
    safe_tenant = quote_plus(tenant_name)
    # Build the path from LOCK_ROOT (as the tenant locks do) rather than
    # repeating the literal prefix; the resulting path is unchanged.
    lock_path = f"{LOCK_ROOT}/events/management/{safe_tenant}"
    with locked(
        SessionAwareLock(client.client, lock_path),
        blocking=blocking
    ) as lock:
        yield lock
@contextmanager
def trigger_queue_lock(client, tenant_name, blocking=True):
    """Hold a tenant's trigger event queue lock for the body's duration.

    :param client: object whose ``client`` attribute is handed to the
        lock recipe.
    :param tenant_name: tenant name; URL-quoted to form the lock path.
    :param blocking: forwarded to ``locked``.

    Yields the lock object.
    """
    safe_tenant = quote_plus(tenant_name)
    # Build the path from LOCK_ROOT (as the tenant locks do) rather than
    # repeating the literal prefix; the resulting path is unchanged.
    lock_path = f"{LOCK_ROOT}/events/trigger/{safe_tenant}"
    with locked(
        SessionAwareLock(client.client, lock_path),
        blocking=blocking
    ) as lock:
        yield lock