Make cloudkitty-processor run several workers

This makes the processor run several workers in separate processes by
default, leading to a significant performance increase. The maximum number of
workers can be configured. Work items:

* Use cotyledon to spawn several workers (see the sketch after this list).

* Remove eventlet monkey patching and use the "threading" executor for
  messaging, in order to avoid conflicts with cotyledon's internal threads.
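
For context, here is a minimal, self-contained sketch of the cotyledon
pattern this change adopts. All names are illustrative, not CloudKitty's:

import time

import cotyledon


class DemoWorker(cotyledon.Service):
    """A service that the cotyledon master forks into its own process."""

    def __init__(self, worker_id):
        super(DemoWorker, self).__init__(worker_id)
        self._worker_id = worker_id
        self._running = True

    def run(self):
        # Called by cotyledon inside the forked child process.
        while self._running:
            print('worker {} alive'.format(self._worker_id))
            time.sleep(1)

    def terminate(self):
        # Called on graceful shutdown (e.g. SIGTERM).
        self._running = False


if __name__ == '__main__':
    manager = cotyledon.ServiceManager()
    manager.add(DemoWorker, workers=2)
    manager.run()  # blocks; workers that die are respawned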

Story: 2005423
Task: 30447
Change-Id: I7e2a77cb7d68afb87274fb44fb208306c3b32473
Luka Peschke 2019-04-12 10:00:54 +02:00
parent 394177fb74
commit 1e60561ad7
6 changed files with 62 additions and 33 deletions


@@ -27,11 +27,7 @@ def main():
     # before the prepare_service(), making cfg.CONF returning default values
     # systematically.
     from cloudkitty import orchestrator
-    processor = orchestrator.Orchestrator()
-    try:
-        processor.process()
-    except KeyboardInterrupt:
-        processor.terminate()
+    orchestrator.OrchestratorServiceManager().run()


 if __name__ == '__main__':


@@ -66,7 +66,7 @@ def get_server(target=None, endpoints=None):
     if not target:
         target = get_target()
     return oslo_messaging.get_rpc_server(transport, target,
-                                         endpoints, executor='eventlet',
+                                         endpoints, executor='threading',
                                          access_policy=access_policy)
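
For reference, a standalone oslo.messaging server using the "threading"
executor could be built as below. The topic and server names are invented
for the example, and a reachable transport_url is assumed to be present in
the configuration:

from oslo_config import cfg
import oslo_messaging


class DemoEndpoint(object):
    def ping(self, ctxt, arg):
        # With executor='threading', handlers run on plain OS threads
        # instead of eventlet greenthreads.
        return arg


transport = oslo_messaging.get_rpc_transport(cfg.CONF)
target = oslo_messaging.Target(topic='demo', server='demo-host')
server = oslo_messaging.get_rpc_server(transport, target, [DemoEndpoint()],
                                       executor='threading')
server.start()
server.wait()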


@@ -16,9 +16,12 @@
 # @author: Stéphane Albert
 #
 import decimal
+import multiprocessing
 import random
+import sys
+import time

-import eventlet
+import cotyledon
 from oslo_concurrency import lockutils
 from oslo_config import cfg
 from oslo_log import log as logging
@@ -36,18 +39,24 @@ from cloudkitty import storage_state as state
 from cloudkitty import transformer
 from cloudkitty import utils as ck_utils

-eventlet.monkey_patch()

 LOG = logging.getLogger(__name__)

 CONF = cfg.CONF

 orchestrator_opts = [
-    cfg.StrOpt('coordination_url',
-               secret=True,
-               help='Coordination driver URL',
-               default='file:///var/lib/cloudkitty/locks'),
+    cfg.StrOpt(
+        'coordination_url',
+        secret=True,
+        help='Coordination driver URL',
+        default='file:///var/lib/cloudkitty/locks'),
+    cfg.IntOpt(
+        'max_workers',
+        default=multiprocessing.cpu_count(),
+        min=1,
+        help='Max nb of workers to run. Defaults to the nb of available CPUs'),
 ]

 CONF.register_opts(orchestrator_opts, group='orchestrator')
 CONF.import_opt('backend', 'cloudkitty.fetcher', 'fetcher')
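
Once the configuration is parsed, the new option is available as
CONF.orchestrator.max_workers. A minimal oslo.config round-trip, standalone
and outside of any CloudKitty code, shows the mechanics:

import multiprocessing

from oslo_config import cfg

CONF = cfg.CONF
CONF.register_opts([
    cfg.IntOpt('max_workers',
               default=multiprocessing.cpu_count(),
               min=1,
               help='Max nb of workers to run.'),
], group='orchestrator')

CONF([])  # parse an empty command line, keeping the defaults
print(CONF.orchestrator.max_workers)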
@@ -152,12 +161,13 @@ class APIWorker(BaseWorker):
 class Worker(BaseWorker):
-    def __init__(self, collector, storage, tenant_id):
+    def __init__(self, collector, storage, tenant_id, worker_id):
         self._collector = collector
         self._storage = storage
         self._period = CONF.collect.period
         self._wait_time = CONF.collect.wait_periods * self._period
         self._tenant_id = tenant_id
+        self._worker_id = worker_id
         self._conf = ck_utils.load_conf(CONF.collect.metrics_conf)
         self._state = state.StateManager()
@@ -201,25 +211,28 @@ class Worker(BaseWorker):
                        raise
                    except Exception as e:
                        LOG.warning(
-                           '[%(scope_id)s] Error while collecting metric '
-                           '%(metric)s: %(error)s. Retrying on next '
-                           'collection cycle.',
-                           {
-                               'scope_id': self._tenant_id,
-                               'metric': metric,
-                               'error': e,
-                           },
+                           '[scope: {scope}, worker: {worker}] Error while '
+                           'collecting metric {metric}: {error}. Retrying on '
+                           'next collection cycle.'.format(
+                               scope=self._tenant_id,
+                               worker=self._worker_id,
+                               metric=metric,
+                               error=e,
+                           ),
                        )
                        # FIXME(peschk_l): here we just exit, and the
                        # collection will be retried during the next collect
                        # cycle. In the future, we should implement a retrying
                        # system in workers
-                       return
+                       sys.exit(0)
                except collector.NoDataCollected:
                    LOG.info(
-                       '[{}] No data collected for metric {} '
-                       'at timestamp {}'.format(
-                           self._tenant_id, metric, ck_utils.ts2dt(timestamp))
+                       '[scope: {scope}, worker: {worker}] No data collected '
+                       'for metric {metric} at timestamp {ts}'.format(
+                           scope=self._tenant_id,
+                           worker=self._worker_id,
+                           metric=metric,
+                           ts=ck_utils.ts2dt(timestamp))
                    )
                else:
                    # Rating
@@ -236,8 +249,11 @@ class Worker(BaseWorker):
             self._state.set_state(self._tenant_id, timestamp)


-class Orchestrator(object):
-    def __init__(self):
+class Orchestrator(cotyledon.Service):
+    def __init__(self, worker_id):
+        self._worker_id = worker_id
+        super(Orchestrator, self).__init__(self._worker_id)
         self.fetcher = driver.DriverManager(
             FETCHERS_NAMESPACE,
             CONF.fetcher.backend,
@@ -287,11 +303,13 @@ class Orchestrator(object):
         # pending_states = self._rating_endpoint.get_module_state()
         pass

-    def process(self):
+    def run(self):
+        LOG.debug('Started worker {}.'.format(self._worker_id))
         while True:
             self.tenants = self.fetcher.get_tenants()
             random.shuffle(self.tenants)
-            LOG.info('Tenants loaded for fetcher %s', self.fetcher.name)
+            LOG.info('[Worker: {w}] Tenants loaded for fetcher {f}'.format(
+                w=self._worker_id, f=self.fetcher.name))

             for tenant_id in self.tenants:
@@ -303,16 +321,23 @@ class Orchestrator(object):
                     self.collector,
                     self.storage,
                     tenant_id,
+                    self._worker_id,
                 )
                 worker.run()

                 lock.release()

             # NOTE(sheeprine): Slow down looping if all tenants are
             # being processed
-            eventlet.sleep(1)
             # FIXME(sheeprine): We may cause a drift here
-            eventlet.sleep(CONF.collect.period)
+            time.sleep(CONF.collect.period)

     def terminate(self):
+        LOG.debug('Terminating worker {}...'.format(self._worker_id))
         self.coord.stop()
+        LOG.debug('Terminated worker {}.'.format(self._worker_id))


+class OrchestratorServiceManager(cotyledon.ServiceManager):
+
+    def __init__(self):
+        super(OrchestratorServiceManager, self).__init__()
+        self.service_id = self.add(Orchestrator, workers=4)
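
With this in place, the cotyledon master process spawns the configured
number of Orchestrator children, invokes run() in each forked child, and
invokes terminate() on graceful shutdown. This is why the former process()
loop becomes run(), and why the cloudkitty-processor entry point in the
first hunk now simply calls OrchestratorServiceManager().run().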


@@ -30,6 +30,7 @@ voluptuous==0.11.1 # BSD-3
 influxdb==5.1.0 # MIT
 Flask==1.0.2 # BSD
 Flask-RESTful==0.3.5 # BSD
+cotyledon==1.5.0 # Apache-2.0

 # test-requirements
 coverage==3.6 # Apache-2.0


@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    The processor is now able to run several parallel workers. By default, one
+    worker is spawned for each available CPU. Workers can be limited through
+    the ``max_workers`` option of the ``orchestrator`` section.
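
For operators, the option lives in the ``orchestrator`` section of the
CloudKitty configuration file; the value below is only an example:

[orchestrator]
# Cap the processor at 4 worker processes instead of one per CPU.
max_workers = 4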


@@ -33,3 +33,4 @@ influxdb>=5.1.0,!=5.2.0,!=5.2.1,!=5.2.2;python_version<'3.0' # MIT
 influxdb>=5.1.0;python_version>='3.0' # MIT
 Flask>=1.0.2 # BSD
 Flask-RESTful>=0.3.5 # BSD
+cotyledon>=1.5.0 # Apache-2.0