deb-python-taskflow/taskflow/jobs/backends/impl_zookeeper.py
Joshua Harlow f8d69ffc31 Adjust on_job_posting to not hold the lock while investigating
To avoid issues where investigating jobs in one thread can cause
a dispatcher thread to block on that same investigation lock, avoid
this entirely by not holding onto that lock while investigating.

This also adds a small cleanup around the notifier thread pool and
ensures that it actually exists in a usable state (and handles the
exceptions that arise when it doesn't) on submission.

Change-Id: I12e71f029726ec54ec0aadbdf71cc3c7a959f047
2014-09-27 14:51:38 -07:00
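
A minimal sketch of the locking pattern this change moves to (the names
below are illustrative only, not taskflow's actual API):

    import threading

    _lock = threading.Lock()
    _known = {}

    def _investigate(path):
        # Stand-in for the (potentially slow) per-path child fetch.
        print("investigating %s" % path)

    def on_children(children):
        # Figure out what needs investigating while holding the lock...
        with _lock:
            todo = [c for c in children if c not in _known]
        # ...but run the investigation with the lock released, so that a
        # dispatcher thread needing the same lock can not deadlock on it.
        for path in todo:
            _investigate(path)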

709 lines
28 KiB
Python

# -*- coding: utf-8 -*-
# Copyright (C) 2013 Yahoo! Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import collections
import contextlib
import functools
import logging
import threading

from concurrent import futures
from kazoo import exceptions as k_exceptions
from kazoo.protocol import paths as k_paths
from kazoo.recipe import watchers
from oslo.serialization import jsonutils
from oslo.utils import excutils
import six

from taskflow import exceptions as excp
from taskflow.jobs import job as base_job
from taskflow.jobs import jobboard
from taskflow.openstack.common import uuidutils
from taskflow import states
from taskflow.types import timing as tt
from taskflow.utils import kazoo_utils
from taskflow.utils import lock_utils
from taskflow.utils import misc
LOG = logging.getLogger(__name__)

UNCLAIMED_JOB_STATES = (
    states.UNCLAIMED,
)
ALL_JOB_STATES = (
    states.UNCLAIMED,
    states.COMPLETE,
    states.CLAIMED,
)

# Transaction support was added in 3.4.0
MIN_ZK_VERSION = (3, 4, 0)

LOCK_POSTFIX = ".lock"
JOB_PREFIX = 'job'
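# Together these produce job nodes named like ``job0000000001`` (via
# zookeeper's sequence support) with ``job0000000001.lock`` siblings
# that get created when a job is claimed.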


def _check_who(who):
if not isinstance(who, six.string_types):
raise TypeError("Job applicant must be a string type")
if len(who) == 0:
raise ValueError("Job applicant must be non-empty")


class ZookeeperJob(base_job.Job):
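    """A job backed by a zookeeper node (and, optionally, a logbook)."""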
def __init__(self, name, board, client, backend, path,
uuid=None, details=None, book=None, book_data=None,
created_on=None):
super(ZookeeperJob, self).__init__(name, uuid=uuid, details=details)
self._board = board
self._book = book
if not book_data:
book_data = {}
self._book_data = book_data
self._client = client
self._backend = backend
if all((self._book, self._book_data)):
raise ValueError("Only one of 'book_data' or 'book'"
" can be provided")
self._path = path
self._lock_path = path + LOCK_POSTFIX
self._created_on = created_on
self._node_not_found = False
@property
def lock_path(self):
return self._lock_path
@property
def path(self):
return self._path
def _get_node_attr(self, path, attr_name, trans_func=None):
try:
_data, node_stat = self._client.get(path)
attr = getattr(node_stat, attr_name)
if trans_func is not None:
return trans_func(attr)
else:
return attr
except k_exceptions.NoNodeError as e:
raise excp.NotFound("Can not fetch the %r attribute"
" of job %s (%s), path %s not found"
% (attr_name, self.uuid, self.path, path), e)
except self._client.handler.timeout_exception as e:
raise excp.JobFailure("Can not fetch the %r attribute"
" of job %s (%s), connection timed out"
% (attr_name, self.uuid, self.path), e)
except k_exceptions.SessionExpiredError as e:
raise excp.JobFailure("Can not fetch the %r attribute"
" of job %s (%s), session expired"
% (attr_name, self.uuid, self.path), e)
except (AttributeError, k_exceptions.KazooException) as e:
raise excp.JobFailure("Can not fetch the %r attribute"
" of job %s (%s), internal error" %
(attr_name, self.uuid, self.path), e)
@property
def last_modified(self):
modified_on = None
try:
if not self._node_not_found:
modified_on = self._get_node_attr(
self.path, 'mtime',
trans_func=misc.millis_to_datetime)
except excp.NotFound:
self._node_not_found = True
return modified_on
@property
def created_on(self):
# This one we can cache (since it won't change after creation).
if self._node_not_found:
return None
if self._created_on is None:
try:
self._created_on = self._get_node_attr(
self.path, 'ctime',
trans_func=misc.millis_to_datetime)
except excp.NotFound:
self._node_not_found = True
return self._created_on
@property
def board(self):
return self._board
def _load_book(self):
book_uuid = self.book_uuid
if self._backend is not None and book_uuid is not None:
# TODO(harlowja): we are currently limited by assuming that the
# job posted has the same backend as this loader (to start this
            # seems to be an ok assumption, and can be adjusted in the future
# if we determine there is a use-case for multi-backend loaders,
# aka a registry of loaders).
with contextlib.closing(self._backend.get_connection()) as conn:
return conn.get_logbook(book_uuid)
# No backend to fetch from or no uuid specified
return None
@property
def state(self):
owner = self.board.find_owner(self)
job_data = {}
try:
raw_data, _data_stat = self._client.get(self.path)
job_data = misc.decode_json(raw_data)
except k_exceptions.NoNodeError:
pass
except k_exceptions.SessionExpiredError as e:
raise excp.JobFailure("Can not fetch the state of %s,"
" session expired" % (self.uuid), e)
except self._client.handler.timeout_exception as e:
raise excp.JobFailure("Can not fetch the state of %s,"
" connection timed out" % (self.uuid), e)
except k_exceptions.KazooException as e:
raise excp.JobFailure("Can not fetch the state of %s, internal"
" error" % (self.uuid), e)
if not job_data:
            # No data means this job has been completed (the owner that we
            # might have fetched will not be fetchable again, since the
            # lock node is removed together with the job node).
return states.COMPLETE
if not owner:
# No owner, but data, still work to be done.
return states.UNCLAIMED
return states.CLAIMED
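    # NOTE: ``__cmp__`` (and the built-in ``cmp`` it uses) only exist on
    # python 2.x; jobs compare (and hash) by their zookeeper node path.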
def __cmp__(self, other):
return cmp(self.path, other.path)
def __hash__(self):
return hash(self.path)
@property
def book(self):
if self._book is None:
self._book = self._load_book()
return self._book
@property
def book_uuid(self):
if self._book:
return self._book.uuid
if self._book_data:
return self._book_data.get('uuid')
else:
return None
@property
def book_name(self):
if self._book:
return self._book.name
if self._book_data:
return self._book_data.get('name')
else:
return None


class ZookeeperJobBoardIterator(six.Iterator):
"""Iterator over a zookeeper jobboard.
It supports the following attributes/constructor arguments:
* ensure_fresh: boolean that requests that during every fetch of a new
set of jobs this will cause the iterator to force the backend to
refresh (ensuring that the jobboard has the most recent job listings).
* only_unclaimed: boolean that indicates whether to only iterate
over unclaimed jobs.
"""
def __init__(self, board, only_unclaimed=False, ensure_fresh=False):
self._board = board
self._jobs = collections.deque()
self._fetched = False
self.ensure_fresh = ensure_fresh
self.only_unclaimed = only_unclaimed
@property
def board(self):
return self._board
def __iter__(self):
return self
def _next_job(self):
if self.only_unclaimed:
allowed_states = UNCLAIMED_JOB_STATES
else:
allowed_states = ALL_JOB_STATES
job = None
while self._jobs and job is None:
maybe_job = self._jobs.popleft()
try:
if maybe_job.state in allowed_states:
job = maybe_job
except excp.JobFailure:
LOG.warn("Failed determining the state of job: %s (%s)",
maybe_job.uuid, maybe_job.path, exc_info=True)
except excp.NotFound:
self._board._remove_job(maybe_job.path)
return job
def __next__(self):
if not self._jobs:
if not self._fetched:
jobs = self._board._fetch_jobs(ensure_fresh=self.ensure_fresh)
self._jobs.extend(jobs)
self._fetched = True
job = self._next_job()
if job is None:
raise StopIteration
else:
return job


class ZookeeperJobBoard(jobboard.NotifyingJobBoard):
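    """A jobboard backed by zookeeper.

    Jobs are znodes under a common board path; claiming a job creates an
    ephemeral ``<job path>.lock`` sibling znode, so that a claim disappears
    if the process holding it dies.
    """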
def __init__(self, name, conf,
client=None, persistence=None, emit_notifications=True):
super(ZookeeperJobBoard, self).__init__(name, conf)
if client is not None:
self._client = client
self._owned = False
else:
self._client = kazoo_utils.make_client(self._conf)
self._owned = True
path = str(conf.get("path", "/taskflow/jobs"))
if not path:
raise ValueError("Empty zookeeper path is disallowed")
if not k_paths.isabs(path):
raise ValueError("Zookeeper path must be absolute")
self._path = path
        # The backend to load the full logbooks from, since what's sent over
# the zookeeper data connection is only the logbook uuid and name, and
# not currently the full logbook (later when a zookeeper backend
# appears we can likely optimize for that backend usage by directly
# reading from the path where the data is stored, if we want).
self._persistence = persistence
# Misc. internal details
self._known_jobs = {}
self._job_lock = threading.RLock()
self._job_cond = threading.Condition(self._job_lock)
self._open_close_lock = threading.RLock()
self._client.add_listener(self._state_change_listener)
self._bad_paths = frozenset([path])
self._job_watcher = None
# Since we use sequenced ids this will be the path that the sequences
# are prefixed with, for example, job0000000001, job0000000002, ...
self._job_base = k_paths.join(path, JOB_PREFIX)
self._worker = None
self._emit_notifications = bool(emit_notifications)
def _emit(self, state, details):
# Submit the work to the executor to avoid blocking the kazoo queue.
try:
self._worker.submit(self.notifier.notify, state, details)
except (AttributeError, RuntimeError):
            # Notification thread is shutdown or non-existent; in either
            # case we just want to skip submitting a notification...
pass
@property
def path(self):
return self._path
@property
def job_count(self):
with self._job_lock:
return len(self._known_jobs)
def _fetch_jobs(self, ensure_fresh=False):
if ensure_fresh:
self._force_refresh()
with self._job_lock:
return sorted(six.itervalues(self._known_jobs))
def _force_refresh(self):
try:
children = self._client.get_children(self.path)
except self._client.handler.timeout_exception as e:
raise excp.JobFailure("Refreshing failure, connection timed out",
e)
except k_exceptions.SessionExpiredError as e:
raise excp.JobFailure("Refreshing failure, session expired", e)
except k_exceptions.NoNodeError:
pass
except k_exceptions.KazooException as e:
raise excp.JobFailure("Refreshing failure, internal error", e)
else:
self._on_job_posting(children, delayed=False)
def iterjobs(self, only_unclaimed=False, ensure_fresh=False):
return ZookeeperJobBoardIterator(self,
only_unclaimed=only_unclaimed,
ensure_fresh=ensure_fresh)
def _remove_job(self, path):
LOG.debug("Removing job that was at path: %s", path)
with self._job_lock:
job = self._known_jobs.pop(path, None)
if job is not None:
self._emit(jobboard.REMOVAL, details={'job': job})
def _process_child(self, path, request):
"""Receives the result of a child data fetch request."""
job = None
try:
raw_data, node_stat = request.get()
job_data = misc.decode_json(raw_data)
created_on = misc.millis_to_datetime(node_stat.ctime)
except (ValueError, TypeError, KeyError):
LOG.warn("Incorrectly formatted job data found at path: %s",
path, exc_info=True)
except self._client.handler.timeout_exception:
LOG.warn("Connection timed out fetching job data from path: %s",
path, exc_info=True)
except k_exceptions.SessionExpiredError:
LOG.warn("Session expired fetching job data from path: %s", path,
exc_info=True)
except k_exceptions.NoNodeError:
LOG.debug("No job node found at path: %s, it must have"
" disappeared or was removed", path)
except k_exceptions.KazooException:
LOG.warn("Internal error fetching job data from path: %s",
path, exc_info=True)
else:
self._job_cond.acquire()
try:
                # Now we can officially check if someone already placed this
                # job's information into the known job set (if it already
                # exists then just leave it alone).
if path not in self._known_jobs:
job = ZookeeperJob(job_data['name'], self,
self._client, self._persistence, path,
uuid=job_data['uuid'],
book_data=job_data.get("book"),
details=job_data.get("details", {}),
created_on=created_on)
self._known_jobs[path] = job
self._job_cond.notify_all()
finally:
self._job_cond.release()
if job is not None:
self._emit(jobboard.POSTED, details={'job': job})
def _on_job_posting(self, children, delayed=True):
LOG.debug("Got children %s under path %s", children, self.path)
child_paths = []
for c in children:
if c.endswith(LOCK_POSTFIX) or not c.startswith(JOB_PREFIX):
# Skip lock paths or non-job-paths (these are not valid jobs)
continue
child_paths.append(k_paths.join(self.path, c))
# Figure out what we really should be investigating and what we
# shouldn't...
investigate_paths = []
with self._job_lock:
removals = set()
for path in six.iterkeys(self._known_jobs):
if path not in child_paths:
removals.add(path)
for path in removals:
self._remove_job(path)
for path in child_paths:
if path in self._bad_paths:
continue
# This pre-check will not guarantee that we will not already
# have the job (if it's being populated elsewhere) but it will
# reduce the amount of duplicated requests in general.
if path in self._known_jobs:
continue
if path not in investigate_paths:
investigate_paths.append(path)
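        # NOTE: the investigation of these paths is intentionally performed
        # *without* holding the job lock, so that a dispatcher thread which
        # needs that same lock can not deadlock against this handler.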
for path in investigate_paths:
# Fire off the request to populate this job.
#
            # This method is *usually* called from an asynchronous handler so
# it's better to exit from this quickly to allow other asynchronous
# handlers to be executed.
request = self._client.get_async(path)
if delayed:
request.rawlink(functools.partial(self._process_child, path))
else:
self._process_child(path, request)
def post(self, name, book=None, details=None):
def format_posting(job_uuid):
posting = {
'uuid': job_uuid,
'name': name,
}
if details:
posting['details'] = details
else:
posting['details'] = {}
if book is not None:
posting['book'] = {
'name': book.name,
'uuid': book.uuid,
}
return posting
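        # A posted job node will then contain something like the following
        # (values are illustrative only):
        #
        #   {"uuid": "<uuid>", "name": "my-job", "details": {},
        #    "book": {"name": "my-book", "uuid": "<uuid>"}}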
# NOTE(harlowja): Jobs are not ephemeral, they will persist until they
# are consumed (this may change later, but seems safer to do this until
# further notice).
job_uuid = uuidutils.generate_uuid()
with self._wrap(job_uuid, None,
"Posting failure: %s", ensure_known=False):
job_posting = format_posting(job_uuid)
job_posting = misc.binary_encode(jsonutils.dumps(job_posting))
job_path = self._client.create(self._job_base,
value=job_posting,
sequence=True,
ephemeral=False)
job = ZookeeperJob(name, self, self._client,
self._persistence, job_path,
book=book, details=details,
uuid=job_uuid)
self._job_cond.acquire()
try:
self._known_jobs[job_path] = job
self._job_cond.notify_all()
finally:
self._job_cond.release()
self._emit(jobboard.POSTED, details={'job': job})
return job
def claim(self, job, who):
def _unclaimable_try_find_owner(cause):
try:
owner = self.find_owner(job)
except Exception:
owner = None
if owner:
msg = "Job %s already claimed by '%s'" % (job.uuid, owner)
else:
msg = "Job %s already claimed" % (job.uuid)
return excp.UnclaimableJob(msg, cause)
_check_who(who)
with self._wrap(job.uuid, job.path, "Claiming failure: %s"):
# NOTE(harlowja): post as json which will allow for future changes
# more easily than a raw string/text.
value = jsonutils.dumps({
'owner': who,
})
# Ensure the target job is still existent (at the right version).
job_data, job_stat = self._client.get(job.path)
txn = self._client.transaction()
# This will abort (and not create the lock) if the job has been
# removed (somehow...) or updated by someone else to a different
# version...
txn.check(job.path, version=job_stat.version)
txn.create(job.lock_path, value=misc.binary_encode(value),
ephemeral=True)
try:
kazoo_utils.checked_commit(txn)
except k_exceptions.NodeExistsError as e:
raise _unclaimable_try_find_owner(e)
except kazoo_utils.KazooTransactionException as e:
if len(e.failures) < 2:
raise
else:
if isinstance(e.failures[0], k_exceptions.NoNodeError):
raise excp.NotFound(
"Job %s not found to be claimed" % job.uuid,
e.failures[0])
if isinstance(e.failures[1], k_exceptions.NodeExistsError):
raise _unclaimable_try_find_owner(e.failures[1])
else:
raise excp.UnclaimableJob(
"Job %s claim failed due to transaction"
" not succeeding" % (job.uuid), e)
@contextlib.contextmanager
def _wrap(self, job_uuid, job_path,
fail_msg_tpl="Failure: %s", ensure_known=True):
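        """Context manager that maps kazoo/zookeeper errors to taskflow ones.

        Errors raised in the body are converted into ``excp.NotFound`` or
        ``excp.JobFailure`` (optionally after first checking that the job
        is still known to this board).
        """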
if job_path:
fail_msg_tpl += " (%s)" % (job_path)
if ensure_known:
if not job_path:
raise ValueError("Unable to check if %r is a known path"
% (job_path))
with self._job_lock:
if job_path not in self._known_jobs:
fail_msg_tpl += ", unknown job"
raise excp.NotFound(fail_msg_tpl % (job_uuid))
try:
yield
except self._client.handler.timeout_exception as e:
fail_msg_tpl += ", connection timed out"
raise excp.JobFailure(fail_msg_tpl % (job_uuid), e)
except k_exceptions.SessionExpiredError as e:
fail_msg_tpl += ", session expired"
raise excp.JobFailure(fail_msg_tpl % (job_uuid), e)
except k_exceptions.NoNodeError:
fail_msg_tpl += ", unknown job"
raise excp.NotFound(fail_msg_tpl % (job_uuid))
except k_exceptions.KazooException as e:
fail_msg_tpl += ", internal error"
raise excp.JobFailure(fail_msg_tpl % (job_uuid), e)
def find_owner(self, job):
with self._wrap(job.uuid, job.path, "Owner query failure: %s"):
try:
self._client.sync(job.lock_path)
raw_data, _lock_stat = self._client.get(job.lock_path)
data = misc.decode_json(raw_data)
owner = data.get("owner")
except k_exceptions.NoNodeError:
owner = None
return owner
def _get_owner_and_data(self, job):
lock_data, lock_stat = self._client.get(job.lock_path)
job_data, job_stat = self._client.get(job.path)
return (misc.decode_json(lock_data), lock_stat,
misc.decode_json(job_data), job_stat)
def consume(self, job, who):
_check_who(who)
with self._wrap(job.uuid, job.path, "Consumption failure: %s"):
try:
owner_data = self._get_owner_and_data(job)
lock_data, lock_stat, data, data_stat = owner_data
except k_exceptions.NoNodeError:
raise excp.JobFailure("Can not consume a job %s"
" which we can not determine"
" the owner of" % (job.uuid))
if lock_data.get("owner") != who:
raise excp.JobFailure("Can not consume a job %s"
" which is not owned by %s"
% (job.uuid, who))
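            # Delete the lock node and the job node atomically (in a single
            # transaction), at the versions that were just fetched.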
txn = self._client.transaction()
txn.delete(job.lock_path, version=lock_stat.version)
txn.delete(job.path, version=data_stat.version)
kazoo_utils.checked_commit(txn)
self._remove_job(job.path)
def abandon(self, job, who):
_check_who(who)
with self._wrap(job.uuid, job.path, "Abandonment failure: %s"):
try:
owner_data = self._get_owner_and_data(job)
lock_data, lock_stat, data, data_stat = owner_data
except k_exceptions.NoNodeError:
raise excp.JobFailure("Can not abandon a job %s"
" which we can not determine"
" the owner of" % (job.uuid))
if lock_data.get("owner") != who:
raise excp.JobFailure("Can not abandon a job %s"
" which is not owned by %s"
% (job.uuid, who))
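            # Only the lock node is removed; the job node is left alone so
            # that the job can later be claimed by someone else.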
txn = self._client.transaction()
txn.delete(job.lock_path, version=lock_stat.version)
kazoo_utils.checked_commit(txn)
def _state_change_listener(self, state):
LOG.debug("Kazoo client has changed to state: %s", state)
def wait(self, timeout=None):
# Wait until timeout expires (or forever) for jobs to appear.
watch = None
if timeout is not None:
watch = tt.StopWatch(duration=float(timeout)).start()
self._job_cond.acquire()
try:
while True:
if not self._known_jobs:
if watch is not None and watch.expired():
raise excp.NotFound("Expired waiting for jobs to"
" arrive; waited %s seconds"
% watch.elapsed())
# This is done since the given timeout can not be provided
# to the condition variable, since we can not ensure that
# when we acquire the condition that there will actually
                    # be jobs (especially if we are spuriously awakened), so we
# must recalculate the amount of time we really have left.
timeout = None
if watch is not None:
timeout = watch.leftover()
self._job_cond.wait(timeout)
else:
it = ZookeeperJobBoardIterator(self)
it._jobs.extend(self._fetch_jobs())
it._fetched = True
return it
finally:
self._job_cond.release()
@property
def connected(self):
return self._client.connected
@lock_utils.locked(lock='_open_close_lock')
def close(self):
if self._owned:
LOG.debug("Stopping client")
kazoo_utils.finalize_client(self._client)
if self._worker is not None:
LOG.debug("Shutting down the notifier")
self._worker.shutdown()
self._worker = None
with self._job_lock:
self._known_jobs.clear()
LOG.debug("Stopped & cleared local state")
@lock_utils.locked(lock='_open_close_lock')
def connect(self, timeout=10.0):
def try_clean():
# Attempt to do the needed cleanup if post-connection setup does
# not succeed (maybe the connection is lost right after it is
# obtained).
try:
self.close()
except k_exceptions.KazooException:
LOG.exception("Failed cleaning-up after post-connection"
" initialization failed")
try:
if timeout is not None:
timeout = float(timeout)
self._client.start(timeout=timeout)
except (self._client.handler.timeout_exception,
k_exceptions.KazooException) as e:
raise excp.JobFailure("Failed to connect to zookeeper", e)
try:
kazoo_utils.check_compatible(self._client, MIN_ZK_VERSION)
if self._worker is None and self._emit_notifications:
self._worker = futures.ThreadPoolExecutor(max_workers=1)
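                # NOTE: submissions to this worker are guarded in _emit(),
                # since it may be shutdown (or not created at all) by the
                # time a notification is emitted.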
self._client.ensure_path(self.path)
if self._job_watcher is None:
self._job_watcher = watchers.ChildrenWatch(
self._client,
self.path,
func=self._on_job_posting,
allow_session_lost=True)
except excp.IncompatibleVersion:
with excutils.save_and_reraise_exception():
try_clean()
except (self._client.handler.timeout_exception,
k_exceptions.KazooException) as e:
try_clean()
raise excp.JobFailure("Failed to do post-connection"
" initialization", e)