From 1512db43ec29e9afa6d05e292cdc7cfce5da2029 Mon Sep 17 00:00:00 2001 From: kgriffs Date: Mon, 16 Sep 2013 18:19:14 -0500 Subject: [PATCH] fix(mongodb): Marker generation creates a bottleneck under heavy load This patch changes markers so that they are generated using a per-queue side counter. A heuristic is used to mitigate a race condition. Due to the new semantics, partial inserts are no longer possible due to collisions, which ended up simplifying the retry logic for posting messages. As a consequence of this patch, the last message posted no longer needs to remain in the queue indefinitely, rendering marconi-gc unnecessary, and so it has been removed. Also, since the mongod GC worker runs once a minute, the queries no longer filter out expired-but-not-yet-gc'd messages; on average, a message may live more than 30 seconds passed it's expected lifetime, but I do not think that this will harm or complicate any application building on top of Marconi, practically speaking. That being said, it is worth calling out in documentation. Closes-Bug: #1218602 Change-Id: I34e24e7dd7c4e017c84eb5929ce37ad4c9e5266a --- etc/marconi.conf-sample | 17 +- marconi/cmd/gc.py | 61 ---- marconi/queues/storage/mongodb/claims.py | 9 +- marconi/queues/storage/mongodb/driver.py | 27 +- marconi/queues/storage/mongodb/messages.py | 261 ++++++++---------- marconi/queues/storage/mongodb/queues.py | 109 +++++++- marconi/queues/storage/mongodb/utils.py | 23 -- marconi/tests/__init__.py | 6 +- marconi/tests/helpers.py | 31 +++ marconi/tests/queues/storage/base.py | 32 ++- setup.cfg | 1 - .../unit/queues/storage/test_impl_mongodb.py | 163 +++++++---- 12 files changed, 399 insertions(+), 341 deletions(-) delete mode 100644 marconi/cmd/gc.py diff --git a/etc/marconi.conf-sample b/etc/marconi.conf-sample index b70a2a546..e546e7036 100644 --- a/etc/marconi.conf-sample +++ b/etc/marconi.conf-sample @@ -8,7 +8,7 @@ # Log to this file! log_file = /var/log/marconi/server.log -;auth_strategy = +;auth_strategy = # ================= Syslog Options ============================ @@ -59,19 +59,6 @@ database = marconi # at the same instant. ;max_retry_jitter = 0.005 -# Frequency of message garbage collections, in seconds -;gc_interval = 300 - -# Threshold of number of expired messages to reach in a given -# queue, before performing the GC. Useful for reducing frequent -# locks on the DB for non-busy queues, or for worker queues -# which process jobs quickly enough to keep the number of in- -# flight messages low. -# -# Note: The higher this number, the larger the memory-mapped DB -# files will be. -;gc_threshold = 1000 - [drivers:proxy:storage:mongodb] uri = mongodb://localhost:27017 database = marconi_proxy @@ -102,4 +89,4 @@ database = marconi_proxy # caching mechanism [oslo_cache] cache_backend = memory -;cache_prefix = my_prefix \ No newline at end of file +;cache_prefix = my_prefix diff --git a/marconi/cmd/gc.py b/marconi/cmd/gc.py deleted file mode 100644 index 92f4e4633..000000000 --- a/marconi/cmd/gc.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2013 Rackspace Hosting, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import sys -import time - -from marconi.common import cli -from marconi.common import config -from marconi.openstack.common import log as logging -from marconi.queues import bootstrap - -PROJECT_CFG = config.project('marconi') -LOG = logging.getLogger(__name__) - - -@cli.runnable -def run(): - """Entry point to start marconi-gc. - - Operators should run 2-3 instances on different - boxes for fault-tolerance. - - Note: This call blocks until the process is killed - or interrupted. - """ - - try: - info = _(u'Starting marconi-gc') - print(info + _(u'. Use CTRL+C to exit...\n')) - LOG.info(info) - - boot = bootstrap.Bootstrap(cli_args=sys.argv[1:]) - storage_driver = boot.storage - gc_interval = storage_driver.gc_interval - - # NOTE(kgriffs): Don't want all garbage collector - # instances running at the same time (will peg the DB). - offset = random.random() * gc_interval - time.sleep(offset) - - while True: - storage_driver.gc() - time.sleep(gc_interval) - - except NotImplementedError as ex: - print(_(u'The configured storage driver does not support GC.\n')) - LOG.exception(ex) diff --git a/marconi/queues/storage/mongodb/claims.py b/marconi/queues/storage/mongodb/claims.py index 48220afd6..578c4ac42 100644 --- a/marconi/queues/storage/mongodb/claims.py +++ b/marconi/queues/storage/mongodb/claims.py @@ -21,6 +21,8 @@ Field Mappings: letter of their long name. """ +import datetime + from bson import objectid from marconi.common import config @@ -129,8 +131,9 @@ class ClaimController(storage.ClaimBase): now = timeutils.utcnow_ts() claim_expires = now + ttl - message_expires = claim_expires + grace message_ttl = ttl + grace + message_expiration = datetime.datetime.utcfromtimestamp( + claim_expires + grace) meta = { 'id': oid, @@ -171,10 +174,10 @@ class ClaimController(storage.ClaimBase): # This sets the expiration time to # `expires` on messages that would # expire before claim. - new_values = {'e': message_expires, 't': message_ttl} + new_values = {'e': message_expiration, 't': message_ttl} msg_ctrl._col.update({'q': queue, 'p': project, - 'e': {'$lt': message_expires}, + 'e': {'$lt': message_expiration}, 'c.id': oid}, {'$set': new_values}, upsert=False, multi=True) diff --git a/marconi/queues/storage/mongodb/driver.py b/marconi/queues/storage/mongodb/driver.py index f7f56a6f7..1456c8d7a 100644 --- a/marconi/queues/storage/mongodb/driver.py +++ b/marconi/queues/storage/mongodb/driver.py @@ -31,28 +31,29 @@ class Driver(storage.DriverBase): def __init__(self): # Lazy instantiation self._database = None + self._connection = None + + def _connect(self): + if options.CFG.uri and 'replicaSet' in options.CFG.uri: + self._connection = pymongo.MongoReplicaSetClient(options.CFG.uri) + else: + self._connection = pymongo.MongoClient(options.CFG.uri) @property def db(self): """Property for lazy instantiation of mongodb's database.""" if self._database is None: - if options.CFG.uri and 'replicaSet' in options.CFG.uri: - conn = pymongo.MongoReplicaSetClient(options.CFG.uri) - else: - conn = pymongo.MongoClient(options.CFG.uri) - - self._database = conn[options.CFG.database] + self._database = self.connection[options.CFG.database] return self._database - def gc(self): - LOG.info(_(u'Performing garbage collection.')) + @property + def connection(self): + """Property for lazy instantiation of mongodb's client connection.""" + if self._connection is None: + self._connect() - try: - self.message_controller.remove_expired() - except pymongo.errors.ConnectionFailure as ex: - # Better luck next time... - LOG.exception(ex) + return self._connection @property def gc_interval(self): diff --git a/marconi/queues/storage/mongodb/messages.py b/marconi/queues/storage/mongodb/messages.py index 0d50a9949..ca08d06be 100644 --- a/marconi/queues/storage/mongodb/messages.py +++ b/marconi/queues/storage/mongodb/messages.py @@ -21,6 +21,7 @@ Field Mappings: letter of their long name. """ +import datetime import time import pymongo.errors @@ -39,9 +40,25 @@ CFG = config.namespace('limits:storage').from_options( default_message_paging=10, ) +# NOTE(kgriffs): This value, in seconds, should be at least less than the +# minimum allowed TTL for messages (60 seconds). Make it 45 to allow for +# some fudge room. +MAX_RETRY_POST_DURATION = 45 + +# NOTE(kgriffs): It is extremely unlikely that all workers would somehow hang +# for more than 5 seconds, without a single one being able to succeed in +# posting some messages and incrementing the counter, thus allowing the other +# producers to succeed in turn. +COUNTER_STALL_WINDOW = 5 + # For hinting ID_INDEX_FIELDS = [('_id', 1)] +# For removing expired messages +TTL_INDEX_FIELDS = [ + ('e', 1) +] + # NOTE(kgriffs): This index is for listing messages, usually # filtering out claimed ones. ACTIVE_INDEX_FIELDS = [ @@ -67,11 +84,11 @@ CLAIMED_INDEX_FIELDS = [ ('c.e', 1), ] -# Index used for _next_marker() and also to ensure uniqueness. +# Index used to ensure uniqueness. MARKER_INDEX_FIELDS = [ ('p', 1), ('q', 1), - ('k', -1) + ('k', 1) ] @@ -94,7 +111,7 @@ class MessageController(storage.MessageBase): # Cache for convenience and performance (avoids extra lookups and # recreating the range for every request.) - self._queue_controller = self.driver.queue_controller + self._queue_ctrl = self.driver.queue_controller self._db = self.driver.db self._retry_range = range(options.CFG.max_attempts) @@ -111,6 +128,11 @@ class MessageController(storage.MessageBase): def _ensure_indexes(self): """Ensures that all indexes are created.""" + self._col.ensure_index(TTL_INDEX_FIELDS, + name='ttl', + expireAfterSeconds=0, + background=True) + self._col.ensure_index(ACTIVE_INDEX_FIELDS, name='active', background=True) @@ -135,36 +157,6 @@ class MessageController(storage.MessageBase): unique=True, background=True) - def _next_marker(self, queue_name, project=None): - """Retrieves the next message marker for a given queue. - - This helper is used to generate monotonic pagination - markers that are saved as part of the message - document. Simply taking the max of the current message - markers works, since Marconi always leaves the most recent - message in the queue (new queues always return 1). - - Note 1: Markers are scoped per-queue and so are *not* - globally unique or globally ordered. - - Note 2: If two or more requests to this method are made - in parallel, this method will return the same - marker. This is done intentionally so that the caller - can detect a parallel message post, allowing it to - mitigate race conditions between producer and - observer clients. - - :param queue_name: Determines the scope for the marker - :param project: Queue's project - :returns: next message marker as an integer - """ - - document = self._col.find_one({'p': project, 'q': queue_name}, - sort=[('k', -1)], - fields={'k': 1, '_id': 0}) - - return 1 if document is None else (document['k'] + 1) - def _backoff_sleep(self, attempt): """Sleep between retries using a jitter algorithm. @@ -180,38 +172,6 @@ class MessageController(storage.MessageBase): time.sleep(seconds) - def _remove_expired(self, queue_name, project): - """Removes all expired messages except for the most recent - in each queue. - - This method is used in lieu of mongo's TTL index since we - must always leave at least one message in the queue for - calculating the next marker. - - :param queue_name: name for the queue from which to remove - expired messages - :param project: Project queue_name belong's too - """ - - # Get the message with the highest marker, and leave - # it in the queue - head = self._col.find_one({'p': project, 'q': queue_name}, - sort=[('k', -1)], fields={'k': 1}) - - if head is None: - # Assume queue was just deleted via a parallel request - LOG.debug(_(u'Queue %s is empty or missing.') % queue_name) - return - - query = { - 'p': project, - 'q': queue_name, - 'k': {'$ne': head['k']}, - 'e': {'$lte': timeutils.utcnow_ts()}, - } - - self._col.remove(query, w=0) - def _purge_queue(self, queue_name, project=None): """Removes all messages from the queue. @@ -267,9 +227,6 @@ class MessageController(storage.MessageBase): # queue and project 'p': project, 'q': queue_name, - - # The messages cannot be expired - 'e': {'$gt': now}, } if not echo: @@ -309,9 +266,6 @@ class MessageController(storage.MessageBase): # Messages must belong to this queue 'p': project, 'q': queue_name, - - # The messages can not be expired - 'e': {'$gt': timeutils.utcnow_ts()}, } if not include_claimed: @@ -399,31 +353,6 @@ class MessageController(storage.MessageBase): {'$set': {'c': {'id': None, 'e': now}}}, upsert=False, multi=True) - def remove_expired(self): - """Removes all expired messages except for the most recent - in each queue. - - This method is used in lieu of mongo's TTL index since we - must always leave at least one message in the queue for - calculating the next marker. - - Warning: This method is expensive, since it must perform - separate queries for each queue, due to the requirement that - it must leave at least one message in each queue, and it - is impractical to send a huge list of _id's to filter out - in a single call. That being said, this is somewhat mitigated - by the fact that remove() is run on each queue seperately, - thereby reducing the duration that any given lock is held. - """ - - # TODO(kgriffs): Optimize first by batching the .removes, second - # by setting a 'last inserted ID' in the queue collection for - # each message inserted (TBD, may cause problematic side-effect), - # and third, by changing the marker algorithm such that it no - # longer depends on retaining the last message in the queue! - for name, project in self._queue_controller._get_np(): - self._remove_expired(name, project) - def list(self, queue_name, project=None, marker=None, limit=None, echo=False, client_uuid=None, include_claimed=False): @@ -469,7 +398,6 @@ class MessageController(storage.MessageBase): '_id': mid, 'p': project, 'q': queue_name, - 'e': {'$gt': now} } message = list(self._col.find(query).limit(1).hint(ID_INDEX_FIELDS)) @@ -493,7 +421,6 @@ class MessageController(storage.MessageBase): '_id': {'$in': message_ids}, 'p': project, 'q': queue_name, - 'e': {'$gt': now}, } # NOTE(flaper87): Should this query @@ -508,19 +435,20 @@ class MessageController(storage.MessageBase): @utils.raises_conn_error def post(self, queue_name, messages, client_uuid, project=None): now = timeutils.utcnow_ts() + now_dt = datetime.datetime.utcfromtimestamp(now) - if not self._queue_controller.exists(queue_name, project): + if not self._queue_ctrl.exists(queue_name, project): raise exceptions.QueueDoesNotExist(queue_name, project) # Set the next basis marker for the first attempt. - next_marker = self._next_marker(queue_name, project) + next_marker = self._queue_ctrl._get_counter(queue_name, project) prepared_messages = [ { 't': message['ttl'], 'q': queue_name, 'p': project, - 'e': now + message['ttl'], + 'e': now_dt + datetime.timedelta(seconds=message['ttl']), 'u': client_uuid, 'c': {'id': None, 'e': now}, 'b': message['body'] if 'body' in message else {}, @@ -530,38 +458,47 @@ class MessageController(storage.MessageBase): for index, message in enumerate(messages) ] - # Results are aggregated across all attempts - # NOTE(kgriffs): Using lazy instantiation... - aggregated_results = None - # Use a retry range for sanity, although we expect # to rarely, if ever, reach the maximum number of # retries. + # + # NOTE(kgriffs): With the default configuration (100 ms + # max sleep, 1000 max attempts), the max stall time + # before the operation is abandoned is 49.95 seconds. for attempt in self._retry_range: try: ids = self._col.insert(prepared_messages) - # NOTE(kgriffs): Only use aggregated results if we must, - # which saves some cycles on the happy path. - if aggregated_results: - aggregated_results.extend(ids) - ids = aggregated_results - # Log a message if we retried, for debugging perf issues if attempt != 0: message = _(u'%(attempts)d attempt(s) required to post ' u'%(num_messages)d messages to queue ' - u'%(queue_name)s and project %(project)s') - message %= dict(queue_name=queue_name, attempts=attempt+1, + u'"%(queue)s" under project %(project)s') + message %= dict(queue=queue_name, attempts=attempt+1, num_messages=len(ids), project=project) LOG.debug(message) + # Update the counter in preparation for the next batch + # + # NOTE(kgriffs): Due to the unique index on the messages + # collection, competing inserts will fail as a whole, + # and keep retrying until the counter is incremented + # such that the competing marker's will start at a + # unique number, 1 past the max of the messages just + # inserted above. + self._queue_ctrl._inc_counter(queue_name, project, + amount=len(ids)) + return map(str, ids) except pymongo.errors.DuplicateKeyError as ex: # Try again with the remaining messages + # TODO(kgriffs): Record stats of how often retries happen, + # and how many attempts, on average, are required to insert + # messages. + # NOTE(kgriffs): This can be used in conjunction with the # log line, above, that is emitted after all messages have # been posted, to guage how long it is taking for messages @@ -570,43 +507,76 @@ class MessageController(storage.MessageBase): # TODO(kgriffs): Add transaction ID to help match up loglines if attempt == 0: message = _(u'First attempt failed while ' - u'adding messages to queue %s ' - u'for current request') % queue_name + u'adding messages to queue ' + u'"%(queue)s" under project %(project)s') + message %= dict(queue=queue_name, project=project) LOG.debug(message) - # TODO(kgriffs): Record stats of how often retries happen, - # and how many attempts, on average, are required to insert - # messages. + # NOTE(kgriffs): Never retry past the point that competing + # messages expire and are GC'd, since once they are gone, + # the unique index no longer protects us from getting out + # of order, which could cause an observer to miss this + # message. The code below provides a sanity-check to ensure + # this situation can not happen. + elapsed = timeutils.utcnow_ts() - now + if elapsed > MAX_RETRY_POST_DURATION: + message = _(u'Exceeded maximum retry duration for queue ' + u'"%(queue)s" under project %(project)s') + message %= dict(queue=queue_name, project=project) - # NOTE(kgriffs): Slice prepared_messages. We have to interpret - # the error message to get the duplicate key, which gives - # us the marker that had a dupe, allowing us to extrapolate - # how many messages were consumed, since markers are monotonic - # counters. - duplicate_marker = utils.dup_marker_from_error(str(ex)) - failed_index = duplicate_marker - next_marker - - # Put the successful one's IDs into aggregated_results. - succeeded_messages = prepared_messages[:failed_index] - succeeded_ids = [msg['_id'] for msg in succeeded_messages] - - # Results are aggregated across all attempts - if aggregated_results is None: - aggregated_results = succeeded_ids - else: - aggregated_results.extend(succeeded_ids) - - # Retry the remaining messages with a new sequence - # of markers. - prepared_messages = prepared_messages[failed_index:] - next_marker = self._next_marker(queue_name, project) - for index, message in enumerate(prepared_messages): - message['k'] = next_marker + index + LOG.warning(message) + break # Chill out for a moment to mitigate thrashing/thundering self._backoff_sleep(attempt) + # NOTE(kgriffs): Perhaps we failed because a worker crashed + # after inserting messages, but before incrementing the + # counter; that would cause all future requests to stall, + # since they would keep getting the same base marker that is + # conflicting with existing messages, until the messages that + # "won" expire, at which time we would end up reusing markers, + # and that could make some messages invisible to an observer + # that is querying with a marker that is large than the ones + # being reused. + # + # To mitigate this, we apply a heuristic to determine whether + # a counter has stalled. We attempt to increment the counter, + # but only if it hasn't been updated for a few seconds, which + # should mean that nobody is left to update it! + # + # Note that we increment one at a time until the logjam is + # broken, since we don't know how many messages were posted + # by the worker before it crashed. + next_marker = self._queue_ctrl._inc_counter( + queue_name, project, window=COUNTER_STALL_WINDOW) + + # Retry the entire batch with a new sequence of markers. + # + # NOTE(kgriffs): Due to the unique index, and how + # MongoDB works with batch requests, we will never + # end up with a partially-successful update. The first + # document in the batch will fail to insert, and the + # remainder of the documents will not be attempted. + if next_marker is None: + # NOTE(kgriffs): Usually we will end up here, since + # it should be rare that a counter becomes stalled. + next_marker = self._queue_ctrl._get_counter( + queue_name, project) + else: + message = (u'Detected a stalled message counter for ' + u'queue "%(queue)s" under project %(project)s. ' + u'The counter was incremented to %(value)d.') + message %= dict(queue=queue_name, + project=project, + value=next_marker) + + LOG.warning(message) + + for index, message in enumerate(prepared_messages): + message['k'] = next_marker + index + except Exception as ex: # TODO(kgriffs): Query the DB to get the last marker that # made it, and extrapolate from there to figure out what @@ -616,13 +586,13 @@ class MessageController(storage.MessageBase): raise message = _(u'Hit maximum number of attempts (%(max)s) for queue ' - u'%(id)s in project %(project)s') - message %= dict(max=options.CFG.max_attempts, id=queue_name, + u'"%(queue)s" under project %(project)s') + message %= dict(max=options.CFG.max_attempts, queue=queue_name, project=project) LOG.warning(message) - succeeded_ids = map(str, aggregated_results) + succeeded_ids = [] raise exceptions.MessageConflict(queue_name, project, succeeded_ids) @utils.raises_conn_error @@ -647,7 +617,6 @@ class MessageController(storage.MessageBase): return now = timeutils.utcnow_ts() - query['e'] = {'$gt': now} message = self._col.find_one(query) if message is None: diff --git a/marconi/queues/storage/mongodb/queues.py b/marconi/queues/storage/mongodb/queues.py index 4726c883c..2fac32d0a 100644 --- a/marconi/queues/storage/mongodb/queues.py +++ b/marconi/queues/storage/mongodb/queues.py @@ -40,13 +40,20 @@ class QueueController(storage.QueueBase): """Implements queue resource operations using MongoDB. Queues: - Name Field - ------------------ - name -> n - project -> p - counter -> c - metadata -> m + Name Field + ------------------- + name -> n + project -> p + msg counter -> c + metadata -> m + + Message Counter: + + Name Field + ------------------- + value -> v + modified ts -> t """ def __init__(self, *args, **kwargs): @@ -76,6 +83,89 @@ class QueueController(storage.QueueBase): cursor = self._col.find({}, fields={'n': 1, 'p': 1}) return ((doc['n'], doc['p']) for doc in cursor) + def _get_counter(self, name, project=None): + """Retrieves the current message counter value for a given queue. + + This helper is used to generate monotonic pagination + markers that are saved as part of the message + document. + + Note 1: Markers are scoped per-queue and so are *not* + globally unique or globally ordered. + + Note 2: If two or more requests to this method are made + in parallel, this method will return the same counter + value. This is done intentionally so that the caller + can detect a parallel message post, allowing it to + mitigate race conditions between producer and + observer clients. + + :param name: Name of the queue to which the counter is scoped + :param project: Queue's project + :returns: current message counter as an integer + """ + + doc = self._col.find_one({'p': project, 'n': name}, + fields={'c.v': 1, '_id': 0}) + + if doc is None: + raise exceptions.QueueDoesNotExist(name, project) + + return doc['c']['v'] + + def _inc_counter(self, name, project=None, amount=1, window=None): + """Increments the message counter and returns the new value. + + :param name: Name of the queue to which the counter is scoped + :param project: Queue's project name + :param amount: (Default 1) Amount by which to increment the counter + :param window: (Default None) A time window, in seconds, that + must have elapsed since the counter was last updated, in + order to increment the counter. + + :returns: Updated message counter value, or None if window + was specified, and the counter has already been updated + within the specified time period. + + :raises: storage.exceptions.QueueDoesNotExist + """ + now = timeutils.utcnow_ts() + + update = {'$inc': {'c.v': amount}, '$set': {'c.t': now}} + query = {'p': project, 'n': name} + if window is not None: + threshold = now - window + query['c.t'] = {'$lt': threshold} + + while True: + try: + doc = self._col.find_and_modify(query, update, new=True, + fields={'c.v': 1, '_id': 0}) + break + except pymongo.errors.AutoReconnect as ex: + LOG.exception(ex) + + if doc is None: + if window is None: + # NOTE(kgriffs): Since we did not filter by a time window, + # the queue should have been found and updated. Perhaps + # the queue has been deleted? + message = _(u'Failed to increment the message ' + u'counter for queue %(name)s and ' + u'project %(project)s') + message %= dict(name=name, project=project) + + LOG.warning(message) + + raise exceptions.QueueDoesNotExist(name, project) + + # NOTE(kgriffs): Assume the queue existed, but the counter + # was recently updated, causing the range query on 'c.t' to + # exclude the record. + return None + + return doc['c']['v'] + #----------------------------------------------------------------------- # Interface #----------------------------------------------------------------------- @@ -116,7 +206,12 @@ class QueueController(storage.QueueBase): @utils.raises_conn_error def create(self, name, project=None): try: - self._col.insert({'p': project, 'n': name, 'm': {}, 'c': 1}) + # NOTE(kgriffs): Start counting at 1, and assume the first + # message ever posted will succeed and set t to a UNIX + # "modified at" timestamp. + counter = {'v': 1, 't': 0} + + self._col.insert({'p': project, 'n': name, 'm': {}, 'c': counter}) except pymongo.errors.DuplicateKeyError: return False diff --git a/marconi/queues/storage/mongodb/utils.py b/marconi/queues/storage/mongodb/utils.py index 88847bcd1..c40cf341a 100644 --- a/marconi/queues/storage/mongodb/utils.py +++ b/marconi/queues/storage/mongodb/utils.py @@ -17,21 +17,16 @@ import collections import datetime import functools import random -import re from bson import errors as berrors from bson import objectid from bson import tz_util from pymongo import errors -from marconi.common import exceptions import marconi.openstack.common.log as logging from marconi.openstack.common import timeutils from marconi.queues.storage import exceptions as storage_exceptions - -DUP_MARKER_REGEX = re.compile(r'\$queue_marker.*?:\s(\d+)') - # BSON ObjectId gives TZ-aware datetime, so we generate a # TZ-aware UNIX epoch for convenience. EPOCH = datetime.datetime.utcfromtimestamp(0).replace(tzinfo=tz_util.utc) @@ -39,24 +34,6 @@ EPOCH = datetime.datetime.utcfromtimestamp(0).replace(tzinfo=tz_util.utc) LOG = logging.getLogger(__name__) -def dup_marker_from_error(error_message): - """Extracts the duplicate marker from a MongoDB error string. - - :param error_message: raw error message string returned - by mongod on a duplicate key error. - - :raises: marconi.common.exceptions.PatternNotFound - :returns: extracted marker as an integer - """ - match = DUP_MARKER_REGEX.findall(error_message) - if not match: - description = (u'Error message could not be parsed: %s' % - error_message) - raise exceptions.PatternNotFound(description) - - return int(match[-1]) - - def cached_gen(iterable): """Converts the iterable into a caching generator. diff --git a/marconi/tests/__init__.py b/marconi/tests/__init__.py index 2f5d176fa..87b51ff79 100644 --- a/marconi/tests/__init__.py +++ b/marconi/tests/__init__.py @@ -18,5 +18,9 @@ from marconi.tests import base from marconi.tests import helpers -TestBase = base.TestBase +SKIP_SLOW_TESTS = helpers.SKIP_SLOW_TESTS +RUN_SLOW_TESTS = not SKIP_SLOW_TESTS + expect = helpers.expect +is_slow = helpers.is_slow +TestBase = base.TestBase diff --git a/marconi/tests/helpers.py b/marconi/tests/helpers.py index 861e3da1f..0a32fdb1f 100644 --- a/marconi/tests/helpers.py +++ b/marconi/tests/helpers.py @@ -14,10 +14,14 @@ # limitations under the License. import contextlib +import functools +import os import uuid import six +SKIP_SLOW_TESTS = os.environ.get('MARCONI_TEST_SLOW') is None + @contextlib.contextmanager def expect(*exc_type): @@ -120,3 +124,30 @@ def entries(controller, count): for p, q, _, _ in spec: controller.delete(p, q) + + +def is_slow(condition=lambda self: True): + """Decorator to flag slow tests. + + Slow tests will be skipped if MARCONI_TEST_SLOW is set, and + condition(self) returns True. + + :param condition: Function that returns True IFF the test will be slow; + useful for child classes which may modify the behavior of a test + such that it may or may not be slow. + """ + + def decorator(func): + @functools.wraps(func) + def wrapper(self): + if SKIP_SLOW_TESTS and condition(self): + msg = ('Skipping slow test. Set MARCONI_TEST_SLOW ' + 'to enable slow tests.') + + self.skipTest(msg) + + func(self) + + return wrapper + + return decorator diff --git a/marconi/tests/queues/storage/base.py b/marconi/tests/queues/storage/base.py index 956764014..8d5478ba1 100644 --- a/marconi/tests/queues/storage/base.py +++ b/marconi/tests/queues/storage/base.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import time from testtools import matchers @@ -43,6 +44,10 @@ class ControllerBaseTest(testing.TestBase): self.driver = self.driver_class() self.controller = self.controller_class(self.driver) + def tearDown(self): + timeutils.clear_time_override() + super(ControllerBaseTest, self).tearDown() + class QueueControllerTest(ControllerBaseTest): """Queue Controller base tests.""" @@ -53,10 +58,6 @@ class QueueControllerTest(ControllerBaseTest): self.message_controller = self.driver.message_controller self.claim_controller = self.driver.claim_controller - def tearDown(self): - timeutils.clear_time_override() - super(QueueControllerTest, self).tearDown() - def test_list(self): num = 15 for queue in xrange(num): @@ -132,21 +133,18 @@ class QueueControllerTest(ControllerBaseTest): self.assertNotEqual(oldest, newest) - # NOTE(kgriffs): Ensure "now" is different enough + # NOTE(kgriffs): Ensure is different enough # for the next comparison to work. - timeutils.set_time_override() - timeutils.advance_time_seconds(60) + soon = timeutils.utcnow() + datetime.timedelta(seconds=60) for message_stat in (oldest, newest): created_iso = message_stat['created'] created = timeutils.parse_isotime(created_iso) self.assertThat(timeutils.normalize_time(created), - matchers.LessThan(timeutils.utcnow())) + matchers.LessThan(soon)) self.assertIn('id', message_stat) - timeutils.clear_time_override() - self.assertThat(oldest['created'], matchers.LessThan(newest['created'])) @@ -188,6 +186,9 @@ class MessageControllerTest(ControllerBaseTest): queue_name = 'test_queue' controller_base_class = storage.MessageBase + # Specifies how often expired messages are purged, in sec. + gc_interval = 0 + def setUp(self): super(MessageControllerTest, self).setUp() @@ -342,7 +343,8 @@ class MessageControllerTest(ControllerBaseTest): project=self.project, claim=cid) - def test_expired_message(self): + @testing.is_slow(condition=lambda self: self.gc_interval != 0) + def test_expired_messages(self): messages = [{'body': 3.14, 'ttl': 0}] [msgid] = self.controller.post(self.queue_name, messages, @@ -353,14 +355,16 @@ class MessageControllerTest(ControllerBaseTest): project=self.project, client_uuid='my_uuid') + time.sleep(self.gc_interval) + with testing.expect(storage.exceptions.DoesNotExist): self.controller.get(self.queue_name, msgid, project=self.project) - countof = self.queue_controller.stats(self.queue_name, - project=self.project) + stats = self.queue_controller.stats(self.queue_name, + project=self.project) - self.assertEquals(countof['messages']['free'], 0) + self.assertEquals(stats['messages']['free'], 0) def test_bad_id(self): # NOTE(cpp-cabrera): A malformed ID should result in an empty diff --git a/setup.cfg b/setup.cfg index fbd9a626a..8366afd00 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,7 +24,6 @@ packages = [entry_points] console_scripts = - marconi-gc = marconi.cmd.gc:run marconi-server = marconi.cmd.server:run marconi.storage = diff --git a/tests/unit/queues/storage/test_impl_mongodb.py b/tests/unit/queues/storage/test_impl_mongodb.py index 30d6b1d5e..81453740f 100644 --- a/tests/unit/queues/storage/test_impl_mongodb.py +++ b/tests/unit/queues/storage/test_impl_mongodb.py @@ -21,8 +21,9 @@ from pymongo import cursor import pymongo.errors from testtools import matchers -from marconi.common import exceptions +from marconi.openstack.common import timeutils from marconi.queues import storage +from marconi.queues.storage import exceptions from marconi.queues.storage import mongodb from marconi.queues.storage.mongodb import controllers from marconi.queues.storage.mongodb import options as mongodb_options @@ -33,28 +34,6 @@ from marconi.tests.queues.storage import base class MongodbUtilsTest(testing.TestBase): - def test_dup_marker_from_error(self): - error_message = ('E11000 duplicate key error index: ' - 'marconi.messages.$queue_marker dup key: ' - '{ : "queue", : "project", : 3 }') - - marker = utils.dup_marker_from_error(error_message) - self.assertEquals(marker, 3) - - error_message = ('E11000 duplicate key error index: ' - 'marconi.messages.$x_y dup key: ' - '{ : "queue", : "project", : 3 }') - - self.assertRaises(exceptions.PatternNotFound, - utils.dup_marker_from_error, error_message) - - error_message = ('E11000 duplicate key error index: ' - 'marconi.messages.$queue_marker dup key: ' - '{ : ObjectId("51adff46b100eb85d8a93a2d") }') - - self.assertRaises(exceptions.PatternNotFound, - utils.dup_marker_from_error, error_message) - def test_calculate_backoff(self): sec = utils.calculate_backoff(0, 10, 2, 0) self.assertEquals(sec, 0) @@ -111,6 +90,7 @@ class MongodbQueueTests(base.QueueControllerTest): def tearDown(self): self.controller._col.drop() + self.message_controller._col.drop() super(MongodbQueueTests, self).tearDown() def test_indexes(self): @@ -143,6 +123,9 @@ class MongodbMessageTests(base.MessageControllerTest): driver_class = mongodb.Driver controller_class = controllers.MessageController + # NOTE(kgriffs): MongoDB's TTL scavenger only runs once a minute + gc_interval = 60 + def setUp(self): if not os.environ.get('MONGODB_TEST_LIVE'): self.skipTest('No MongoDB instance running') @@ -152,11 +135,9 @@ class MongodbMessageTests(base.MessageControllerTest): def tearDown(self): self.controller._col.drop() + self.queue_controller._col.drop() super(MongodbMessageTests, self).tearDown() - def _count_expired(self, queue, project=None): - return self.controller._count_expired(queue, project) - def test_indexes(self): col = self.controller._col indexes = col.index_information() @@ -165,58 +146,121 @@ class MongodbMessageTests(base.MessageControllerTest): self.assertIn('queue_marker', indexes) self.assertIn('counting', indexes) - def test_next_marker(self): + def test_message_counter(self): queue_name = 'marker_test' iterations = 10 self.queue_controller.create(queue_name) - seed_marker1 = self.controller._next_marker(queue_name) + seed_marker1 = self.queue_controller._get_counter(queue_name) self.assertEqual(seed_marker1, 1, 'First marker is 1') for i in range(iterations): self.controller.post(queue_name, [{'ttl': 60}], 'uuid') - marker1 = self.controller._next_marker(queue_name) - marker2 = self.controller._next_marker(queue_name) - marker3 = self.controller._next_marker(queue_name) + + marker1 = self.queue_controller._get_counter(queue_name) + marker2 = self.queue_controller._get_counter(queue_name) + marker3 = self.queue_controller._get_counter(queue_name) + self.assertEqual(marker1, marker2) self.assertEqual(marker2, marker3) - self.assertEqual(marker1, i + 2) - def test_remove_expired(self): - num_projects = 10 - num_queues = 10 - messages_per_queue = 100 + new_value = self.queue_controller._inc_counter(queue_name) + self.assertIsNotNone(new_value) - projects = ['gc-test-project-{0}'.format(i) - for i in range(num_projects)] + value_before = self.queue_controller._get_counter(queue_name) + new_value = self.queue_controller._inc_counter(queue_name) + self.assertIsNotNone(new_value) + value_after = self.queue_controller._get_counter(queue_name) + self.assertEquals(value_after, value_before + 1) - queue_names = ['gc-test-{0}'.format(i) for i in range(num_queues)] - client_uuid = 'b623c53c-cf75-11e2-84e1-a1187188419e' - messages = [{'ttl': 0, 'body': str(i)} - for i in range(messages_per_queue)] + value_before = value_after + new_value = self.queue_controller._inc_counter(queue_name, amount=7) + value_after = self.queue_controller._get_counter(queue_name) + self.assertEquals(value_after, value_before + 7) + self.assertEquals(value_after, new_value) - for project in projects: - for queue in queue_names: - self.queue_controller.create(queue, project) - self.controller.post(queue, messages, client_uuid, project) + reference_value = value_after - self.controller.remove_expired() + unchanged = self.queue_controller._inc_counter(queue_name, window=10) + self.assertIsNone(unchanged) - for project in projects: - for queue in queue_names: - query = {'q': queue, 'p': project} + # TODO(kgriffs): Pass utcnow to work around bug + # in set_time_override until we merge the fix in + # from upstream. + timeutils.set_time_override(timeutils.utcnow()) - cursor = self.driver.db.messages.find(query) - count = cursor.count() + timeutils.advance_time_seconds(10) + changed = self.queue_controller._inc_counter(queue_name, window=5) + self.assertEquals(changed, reference_value + 1) + timeutils.clear_time_override() - # Expect that the most recent message for each queue - # will not be removed. - self.assertEquals(count, 1) + def test_race_condition_on_post(self): + queue_name = 'marker_test' + self.queue_controller.create(queue_name) - message = next(cursor) - self.assertEquals(message['k'], messages_per_queue) + expected_messages = [ + { + 'ttl': 60, + 'body': { + 'event': 'BackupStarted', + 'backupId': 'c378813c-3f0b-11e2-ad92-7823d2b0f3ce', + }, + }, + { + 'ttl': 60, + 'body': { + 'event': 'BackupStarted', + 'backupId': 'd378813c-3f0b-11e2-ad92-7823d2b0f3ce', + }, + }, + { + 'ttl': 60, + 'body': { + 'event': 'BackupStarted', + 'backupId': 'e378813c-3f0b-11e2-ad92-7823d2b0f3ce', + }, + }, + ] + + uuid = '97b64000-2526-11e3-b088-d85c1300734c' + + # NOTE(kgriffs): Patch _inc_counter so it is a noop, so that + # the second time we post, we will get a collision. This simulates + # what happens when we have parallel requests and the "winning" + # requests hasn't gotten around to calling _inc_counter before the + # "losing" request attempts to insert it's batch of messages. + with mock.patch.object(mongodb.queues.QueueController, + '_inc_counter', autospec=True) as method: + + method.return_value = 2 + messages = expected_messages[:1] + created = list(self.controller.post(queue_name, messages, uuid)) + self.assertEquals(len(created), 1) + + # Force infinite retries + if testing.RUN_SLOW_TESTS: + method.return_value = None + + with testing.expect(exceptions.MessageConflict): + self.controller.post(queue_name, messages, uuid) + + created = list(self.controller.post(queue_name, + expected_messages[1:], + uuid)) + + self.assertEquals(len(created), 2) + + expected_ids = [m['body']['backupId'] for m in expected_messages] + + interaction = self.controller.list(queue_name, client_uuid=uuid, + echo=True) + actual_messages = list(next(interaction)) + self.assertEquals(len(actual_messages), len(expected_messages)) + actual_ids = [m['body']['backupId'] for m in actual_messages] + + self.assertEquals(actual_ids, expected_ids) def test_empty_queue_exception(self): queue_name = 'empty-queue-test' @@ -244,6 +288,11 @@ class MongodbClaimTests(base.ClaimControllerTest): super(MongodbClaimTests, self).setUp() self.load_conf('wsgi_mongodb.conf') + def tearDown(self): + self.message_controller._col.drop() + self.queue_controller._col.drop() + super(MongodbClaimTests, self).tearDown() + def test_claim_doesnt_exist(self): """Verifies that operations fail on expired/missing claims.