Refactor forwarder

Remove the use of the TransactionManager and MetricTransaction classes.
The forwarder now queues measurements in a list and flushes them to the
API once a configured batch size or flush interval is reached, whichever
comes first.

Change-Id: Icbb4681b578c12dd17b53ba5c582dc64412ad533
Michael James Hoppal 2016-03-15 09:54:05 -06:00 committed by Joe Keen
parent 673f3cdf74
commit 633dd18196
2 changed files with 56 additions and 371 deletions
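
For orientation, here is a minimal, Tornado-free sketch of the
batch-and-flush pattern this commit introduces. The constants mirror
those in the diff below (MIN_BATCH_SIZE, MAX_FLUSH_ATTEMPTS,
FLUSH_INTERVAL), while the Batcher class and the post_fn callback are
hypothetical stand-ins for the real Forwarder and
MonascaAPI.post_metrics.

    MIN_BATCH_SIZE = 200    # flush as soon as this many measurements are queued
    MAX_FLUSH_ATTEMPTS = 3  # ...or after this many flush ticks with a smaller batch
    FLUSH_INTERVAL = 1      # seconds between flush ticks

    class Batcher(object):
        def __init__(self, post_fn):
            self._post = post_fn          # e.g. MonascaAPI(...).post_metrics
            self._batch = []
            self._unflushed_iterations = 0

        def add(self, measurements):
            # Called for each POST to /intake; extends the shared batch.
            self._batch.extend(measurements)

        def flush(self):
            # Called once per FLUSH_INTERVAL by a periodic timer.
            if not self._batch:
                return
            if (len(self._batch) >= MIN_BATCH_SIZE or
                    self._unflushed_iterations >= MAX_FLUSH_ATTEMPTS):
                self._post(self._batch)
                self._batch = []
                self._unflushed_iterations = 0
            else:
                self._unflushed_iterations += 1

Flushing once MIN_BATCH_SIZE is reached keeps requests large enough to
amortize HTTP overhead, while MAX_FLUSH_ATTEMPTS bounds how long a
small batch can sit in memory before being sent anyway.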

@@ -1,5 +1,5 @@
#!/usr/bin/env python
# (C) Copyright 2015 Hewlett Packard Enterprise Development Company LP
# (C) Copyright 2015,2016 Hewlett Packard Enterprise Development Company LP
"""
Licensed under Simplified BSD License (see LICENSE)
(C) Boxed Ice 2010 all rights reserved
@@ -7,7 +7,6 @@
"""
# Standard imports
import datetime
import logging
import signal
import socket
@@ -35,81 +34,50 @@ import monasca_agent.common.config as cfg
import monasca_agent.common.metrics as metrics
import monasca_agent.common.util as util
import monasca_agent.forwarder.api.monasca_api as mon
import monasca_agent.forwarder.transaction as transaction
log = logging.getLogger('forwarder')
# Maximum delay before replaying a transaction
MAX_WAIT_FOR_REPLAY = datetime.timedelta(seconds=90)
# Max number of flush intervals to wait for the min batch size before flushing anyway
MAX_FLUSH_ATTEMPTS = 3
# Maximum queue size in bytes (when this is reached, old messages are dropped)
MAX_QUEUE_SIZE = 30 * 1024 * 1024 # 30MB
MIN_BATCH_SIZE = 200
THROTTLING_DELAY = datetime.timedelta(microseconds=1000000 / 2) # 2 msg/second
message_batch = []
class StatusHandler(tornado.web.RequestHandler):
def get(self):
threshold = int(self.get_argument('threshold', -1))
m = transaction.MetricTransaction.get_tr_manager()
self.write(
"<table><tr><td>Id</td><td>Size</td><td>Error count</td><td>Next flush</td></tr>")
transactions = m.get_transactions()
for tr in transactions:
self.write("<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" %
(tr.get_id(), tr.get_size(), tr.get_error_count(), tr.get_next_flush()))
self.write("</table>")
if threshold >= 0:
if len(transactions) > threshold:
self.set_status(503)
# In seconds
FLUSH_INTERVAL = 1
class AgentInputHandler(tornado.web.RequestHandler):
def post(self):
"""Read the message and forward it to the intake
The message is expected to follow the format:
"""Read the message and add it to the batch.
The batch is sent to the Monasca API once the batch size or the max
wait time has been reached, whichever comes first.
"""
# Read the message; it should be a list of
# monasca_agent.common.metrics.Measurement objects expressed as dicts
global message_batch
msg = tornado.escape.json_decode(self.request.body)
try:
measurements = [metrics.Measurement(**m) for m in msg]
message_batch.extend([metrics.Measurement(**m) for m in msg])
except Exception:
log.exception('Error parsing body of Agent Input')
raise tornado.web.HTTPError(500)
headers = self.request.headers
if len(measurements) > 0:
# Setup a transaction for this message
tr = transaction.MetricTransaction(measurements, headers)
else:
raise tornado.web.HTTPError(500)
self.write("Transaction: %s" % tr.get_id())
class Forwarder(tornado.web.Application):
def __init__(self, port, agent_config, skip_ssl_validation=False,
use_simple_http_client=False):
self._unflushed_iterations = 0
self._endpoint = mon.MonascaAPI(agent_config)
self._ioloop = None
self._port = int(port)
self._agent_config = agent_config
self.flush_interval = (int(agent_config.get('check_freq')) / 2) * 1000
self._metrics = {}
transaction.MetricTransaction.set_application(self)
transaction.MetricTransaction.set_endpoints(mon.MonascaAPI(agent_config))
self._tr_manager = transaction.TransactionManager(MAX_WAIT_FOR_REPLAY,
MAX_QUEUE_SIZE,
THROTTLING_DELAY,
agent_config)
transaction.MetricTransaction.set_tr_manager(self._tr_manager)
self._flush_interval = FLUSH_INTERVAL * 1000
self._non_local_traffic = agent_config.get("non_local_traffic", False)
logging.getLogger().setLevel(agent_config.get('log_level', logging.INFO))
self.skip_ssl_validation = skip_ssl_validation or agent_config.get(
'skip_ssl_validation', False)
@@ -117,13 +85,6 @@ class Forwarder(tornado.web.Application):
if self.skip_ssl_validation:
log.info("Skipping SSL hostname validation, useful when using a transparent proxy")
def _post_metrics(self):
if len(self._metrics) > 0:
transaction.MetricTransaction(self._metrics, headers={'Content-Type': 'application/json'})
self._metrics = {}
# todo why is the tornado logging method overridden? Perhaps ditch this.
def log_request(self, handler):
"""Override the tornado logging method.
If everything goes well, log level is DEBUG.
@@ -139,11 +100,9 @@ class Forwarder(tornado.web.Application):
log_method("%d %s %.2fms", handler.get_status(),
handler._request_summary(), request_time)
def run(self):
def _add_tornado_handlers(self):
handlers = [
(r"/intake/?", AgentInputHandler),
(r"/api/v1/series/?", AgentInputHandler),
(r"/status/?", StatusHandler),
(r"/intake/?", AgentInputHandler)
]
settings = dict(
@@ -153,14 +112,12 @@ class Forwarder(tornado.web.Application):
log_function=self.log_request
)
non_local_traffic = self._agent_config.get("non_local_traffic", False)
tornado.web.Application.__init__(self, handlers, **settings)
http_server = tornado.httpserver.HTTPServer(self)
def _bind_http_server(self, http_server):
try:
# non_local_traffic must be == True to match, not just some non-false value
if non_local_traffic is True:
if self._non_local_traffic is True:
http_server.listen(self._port)
else:
# localhost in lieu of 127.0.0.1 to support IPv6
@@ -186,26 +143,43 @@ class Forwarder(tornado.web.Application):
log.info("Listening on port %d" % self._port)
# Register callbacks
self.mloop = util.get_tornado_ioloop()
def _post_metrics(self):
global message_batch
self._endpoint.post_metrics(message_batch)
log.info("wrote {}".format(len(message_batch)))
message_batch = []
self._unflushed_iterations = 0
logging.getLogger().setLevel(self._agent_config.get('log_level', logging.INFO))
def flush_trs():
def flush(self):
if not message_batch:
return
if len(message_batch) >= MIN_BATCH_SIZE or self._unflushed_iterations >= MAX_FLUSH_ATTEMPTS:
self._post_metrics()
self._tr_manager.flush()
else:
self._unflushed_iterations += 1
tr_sched = tornado.ioloop.PeriodicCallback(
flush_trs, self.flush_interval, io_loop=self.mloop)
def run(self):
log.info("Forwarder RUN")
self._add_tornado_handlers()
# Start everything
tr_sched.start()
http_server = tornado.httpserver.HTTPServer(self)
self._bind_http_server(http_server)
self._ioloop = util.get_tornado_ioloop()
callback = tornado.ioloop.PeriodicCallback(self.flush,
self._flush_interval,
io_loop=self._ioloop)
callback.start()
self._ioloop.start()
self.mloop.start()
log.info("Stopped")
def stop(self):
self.mloop.stop()
if self._ioloop:
self._ioloop.stop()
def init_forwarder(skip_ssl_validation=False, use_simple_http_client=False):

@@ -1,289 +0,0 @@
# (C) Copyright 2015 Hewlett Packard Enterprise Development Company LP
# stdlib
from datetime import datetime
from datetime import timedelta
import logging
from operator import attrgetter
import sys
import time
# project
import monasca_agent.common.check_status as check_status
import monasca_agent.common.metrics as metrics
import monasca_agent.common.util as util
log = logging.getLogger(__name__)
FLUSH_LOGGING_PERIOD = 20
FLUSH_LOGGING_INITIAL = 5
class Transaction(object):
def __init__(self):
self._id = None
self._error_count = 0
self._next_flush = datetime.now()
self._size = None
def get_id(self):
return self._id
def set_id(self, new_id):
assert self._id is None
self._id = new_id
def inc_error_count(self):
self._error_count += 1
def get_error_count(self):
return self._error_count
def get_size(self):
if self._size is None:
self._size = sys.getsizeof(self)
return self._size
def get_next_flush(self):
return self._next_flush
def compute_next_flush(self, max_delay):
# Transactions are replayed, try to send them faster for newer transactions
# Send them every MAX_WAIT_FOR_REPLAY at most
td = timedelta(seconds=self._error_count * 20)
if td > max_delay:
td = max_delay
newdate = datetime.now() + td
self._next_flush = newdate.replace(microsecond=0)
def time_to_flush(self, now=datetime.now()):
return self._next_flush < now
def flush(self):
raise NotImplementedError("To be implemented in a subclass")
class MetricTransaction(Transaction):
_application = None
_trManager = None
_endpoints = []
@classmethod
def set_application(cls, app):
cls._application = app
@classmethod
def set_tr_manager(cls, manager):
cls._trManager = manager
@classmethod
def get_tr_manager(cls):
return cls._trManager
@classmethod
def set_endpoints(cls, endpoint):
# todo we only have one endpoint option, generalize it better
# the immediate use case for two endpoints could be our own monitoring boxes, they could send to
# the main api and mini-mon api
cls._endpoints.append(endpoint)
def __init__(self, data, headers):
self._data = data
self._headers = headers
# Call after data has been set (size is computed in Transaction's init)
Transaction.__init__(self)
# Insert the transaction in the Manager
self._trManager.append(self)
log.debug("Created transaction %d" % self.get_id())
def __sizeof__(self):
return sys.getsizeof(self._data)
def flush(self):
try:
for endpoint in self._endpoints:
endpoint.post_metrics(self._data)
except Exception:
log.exception('Error flushing metrics to remote endpoints')
self._trManager.tr_error(self)
else:
self._trManager.tr_success(self)
self._trManager.flush_next()
class TransactionManager(util.Dimensions):
"""Holds any transaction derived object list and make sure they
are all committed, without exceeding parameters (throttling, memory consumption)
"""
def __init__(self, max_wait_for_replay, max_queue_size, throttling_delay, agent_config):
super(TransactionManager, self).__init__(agent_config)
self._MAX_WAIT_FOR_REPLAY = max_wait_for_replay
self._MAX_QUEUE_SIZE = max_queue_size
self._THROTTLING_DELAY = throttling_delay
self._flush_without_ioloop = False # useful for tests
self._transactions = [] # List of all non committed transactions
self._total_count = 0 # Maintain size/count not to recompute it everytime
self._total_size = 0
self._flush_count = 0
self._transactions_received = 0
self._transactions_flushed = 0
# Global counter to assign a number to each transaction: we may have an issue
# if this overlaps
self._counter = 0
self._trs_to_flush = None # Current transactions being flushed
self._last_flush = datetime.now() # Last flush (for throttling)
# Track an initial status message.
check_status.ForwarderStatus().persist()
def get_transactions(self):
return self._transactions
def print_queue_stats(self):
log.debug("Queue size: at %s, %s transaction(s), %s KB" %
(time.time(), self._total_count, (self._total_size / 1024)))
def get_tr_id(self):
self._counter += 1
return self._counter
def append(self, tr):
# Give the transaction an id
tr.set_id(self.get_tr_id())
# Check the size
tr_size = tr.get_size()
log.debug("New transaction to add, total size of queue would be: %s KB" %
((self._total_size + tr_size) / 1024))
if (self._total_size + tr_size) > self._MAX_QUEUE_SIZE:
log.warn("Queue is too big, removing old transactions...")
new_trs = sorted(self._transactions, key=attrgetter('_next_flush'), reverse=True)
for tr2 in new_trs:
if (self._total_size + tr_size) > self._MAX_QUEUE_SIZE:
self._transactions.remove(tr2)
self._total_count -= 1
self._total_size = self._total_size - tr2.get_size()
log.warn("Removed transaction %s from queue" % tr2.get_id())
# Done
self._transactions.append(tr)
self._total_count += 1
self._transactions_received += 1
self._total_size = self._total_size + tr_size
log.debug("Transaction %s added" % (tr.get_id()))
self.print_queue_stats()
def flush(self):
if self._trs_to_flush is not None:
log.debug("A flush is already in progress, not doing anything")
return
to_flush = []
# Do we have something to do ?
now = datetime.now()
for tr in self._transactions:
if tr.time_to_flush(now):
to_flush.append(tr)
count = len(to_flush)
should_log = self._flush_count + 1 <= FLUSH_LOGGING_INITIAL or \
(self._flush_count + 1) % FLUSH_LOGGING_PERIOD == 0
if count > 0:
if should_log:
log.info("Flushing %s transaction%s during flush #%s" %
(count, util.plural(count), str(self._flush_count + 1)))
else:
log.debug("Flushing %s transaction%s during flush #%s" %
(count, util.plural(count), str(self._flush_count + 1)))
timer = util.Timer()
self._trs_to_flush = to_flush
self.flush_next()
# The emit time is reported on the next run.
dimensions = self._set_dimensions({'component': 'monasca-agent', 'service': 'monitoring'})
emit_measurement = metrics.Measurement('monasca.emit_time_sec', time.time(), timer.step(), dimensions)
MetricTransaction([emit_measurement], headers={'Content-Type': 'application/json'})
else:
if should_log:
log.info("No transaction to flush during flush #%s" % str(self._flush_count + 1))
else:
log.debug("No transaction to flush during flush #%s" % str(self._flush_count + 1))
if self._flush_count + 1 == FLUSH_LOGGING_INITIAL:
log.info("First flushes done, next flushes will be logged every %s flushes." %
FLUSH_LOGGING_PERIOD)
self._flush_count += 1
check_status.ForwarderStatus(queue_length=self._total_count,
queue_size=self._total_size,
flush_count=self._flush_count,
transactions_received=self._transactions_received,
transactions_flushed=self._transactions_flushed).persist()
def flush_next(self):
if len(self._trs_to_flush) > 0:
td = self._last_flush + self._THROTTLING_DELAY - datetime.now()
# Python 2.7 has this built in, python < 2.7 don't...
if hasattr(td, 'total_seconds'):
delay = td.total_seconds()
else:
delay = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 10.0 ** 6
if delay <= 0:
tr = self._trs_to_flush.pop()
self._last_flush = datetime.now()
log.debug("Flushing transaction %d" % tr.get_id())
try:
tr.flush()
except Exception as e:
log.exception(e)
self.tr_error(tr)
self.flush_next()
else:
# Wait a little bit more
tornado_ioloop = util.get_tornado_ioloop()
if tornado_ioloop._running:
tornado_ioloop.add_timeout(time.time() + delay, lambda: self.flush_next())
elif self._flush_without_ioloop:
# Tornado is no started (ie, unittests), do it manually: BLOCKING
time.sleep(delay)
self.flush_next()
else:
self._trs_to_flush = None
def tr_error(self, tr):
tr.inc_error_count()
tr.compute_next_flush(self._MAX_WAIT_FOR_REPLAY)
log.warn(
"Transaction %d in error (%s error%s), it will be replayed after %s" %
(tr.get_id(),
tr.get_error_count(),
util.plural(tr.get_error_count()),
tr.get_next_flush()))
def tr_success(self, tr):
log.debug("Transaction %d completed" % tr.get_id())
self._transactions.remove(tr)
self._total_count += -1
self._total_size += - tr.get_size()
self._transactions_flushed += 1
self.print_queue_stats()
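
For reference, the replay backoff implemented by the removed
compute_next_flush above is linear in the error count and capped: a
failed transaction is retried after error_count * 20 seconds, at most
MAX_WAIT_FOR_REPLAY (90 seconds). A standalone sketch of just that
calculation (the next_flush helper name is hypothetical):

    from datetime import datetime, timedelta

    MAX_WAIT_FOR_REPLAY = timedelta(seconds=90)

    def next_flush(error_count, now=None):
        # Linear, capped backoff: 1 error -> +20s, 2 -> +40s, 5+ -> +90s.
        delay = timedelta(seconds=error_count * 20)
        if delay > MAX_WAIT_FOR_REPLAY:
            delay = MAX_WAIT_FOR_REPLAY
        return ((now or datetime.now()) + delay).replace(microsecond=0)

This commit removes the replay queue along with the rest of the
TransactionManager; the sketch documents only what the deleted code did.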