Make cloudkitty-processor run several workers

This makes the processor run several workers in separate processes by
default, leading to a significant performance improvement. The maximum
number of workers can be configured. Work items:

* Use cotyledon to spawn several workers (a minimal sketch of the pattern
  follows the commit message below).

* Remove eventlet monkey patching and use the "threading" executor for
  messaging in order to avoid conflicts with cotyledon's internal threads.

Story: 2005423
Task: 30447
Change-Id: I7e2a77cb7d68afb87274fb44fb208306c3b32473
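
A minimal, self-contained sketch of the cotyledon pattern adopted here
(class and worker-count values are illustrative, not CloudKitty's): the
ServiceManager forks one process per worker, and each child instantiates the
Service subclass with its worker_id and drives it through run()/terminate().

    import time

    import cotyledon


    class ProcessorService(cotyledon.Service):
        """One worker process; cotyledon passes a unique worker_id to each."""

        def __init__(self, worker_id):
            super(ProcessorService, self).__init__(worker_id)
            self._worker_id = worker_id

        def run(self):
            # Main loop of the worker; runs in its own OS process.
            while True:
                print('worker {} doing one collection cycle'.format(
                    self._worker_id))
                time.sleep(1)

        def terminate(self):
            # Called by the ServiceManager on SIGTERM/SIGINT.
            print('worker {} exiting'.format(self._worker_id))


    def main():
        manager = cotyledon.ServiceManager()
        # Spawn two worker processes; the manager restarts them if they die.
        manager.add(ProcessorService, workers=2)
        manager.run()


    if __name__ == '__main__':
        main()
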
Author: Luka Peschke
Date:   2019-04-12 10:00:54 +02:00
Parent: 394177fb74
Commit: 1e60561ad7

6 changed files with 62 additions and 33 deletions

@@ -27,11 +27,7 @@ def main():
     # before the prepare_service(), making cfg.CONF returning default values
     # systematically.
     from cloudkitty import orchestrator
-    processor = orchestrator.Orchestrator()
-    try:
-        processor.process()
-    except KeyboardInterrupt:
-        processor.terminate()
+    orchestrator.OrchestratorServiceManager().run()


 if __name__ == '__main__':

@@ -66,7 +66,7 @@ def get_server(target=None, endpoints=None):
     if not target:
         target = get_target()
     return oslo_messaging.get_rpc_server(transport, target,
-                                         endpoints, executor='eventlet',
+                                         endpoints, executor='threading',
                                          access_policy=access_policy)
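
For context, a hedged sketch (not CloudKitty's actual rpc module) of an RPC
server built with the 'threading' executor, which uses regular OS threads and
therefore coexists with cotyledon without eventlet monkey patching. Topic and
endpoint names are made up, and it assumes an oslo.messaging version that
still accepts the executor argument, as this change does:

    import oslo_messaging
    from oslo_config import cfg


    class DemoEndpoint(object):
        """Illustrative endpoint exposing a single RPC method."""

        def ping(self, ctxt, arg):
            return arg


    def build_server(conf=cfg.CONF):
        # Requires a configured transport_url (e.g. rabbit://...).
        transport = oslo_messaging.get_rpc_transport(conf)
        target = oslo_messaging.Target(topic='demo', server='demo-host')
        return oslo_messaging.get_rpc_server(transport, target,
                                             [DemoEndpoint()],
                                             executor='threading')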

@@ -16,9 +16,12 @@
 # @author: Stéphane Albert
 #
 import decimal
+import multiprocessing
 import random
+import sys
+import time

-import eventlet
+import cotyledon
 from oslo_concurrency import lockutils
 from oslo_config import cfg
 from oslo_log import log as logging
@@ -36,18 +39,24 @@ from cloudkitty import storage_state as state
 from cloudkitty import transformer
 from cloudkitty import utils as ck_utils

-eventlet.monkey_patch()
-
 LOG = logging.getLogger(__name__)

 CONF = cfg.CONF

 orchestrator_opts = [
-    cfg.StrOpt('coordination_url',
-               secret=True,
-               help='Coordination driver URL',
-               default='file:///var/lib/cloudkitty/locks'),
+    cfg.StrOpt(
+        'coordination_url',
+        secret=True,
+        help='Coordination driver URL',
+        default='file:///var/lib/cloudkitty/locks'),
+    cfg.IntOpt(
+        'max_workers',
+        default=multiprocessing.cpu_count(),
+        min=1,
+        help='Max nb of workers to run. Defaults to the nb of available CPUs'),
 ]

 CONF.register_opts(orchestrator_opts, group='orchestrator')
 CONF.import_opt('backend', 'cloudkitty.fetcher', 'fetcher')
@@ -152,12 +161,13 @@ class APIWorker(BaseWorker):


 class Worker(BaseWorker):
-    def __init__(self, collector, storage, tenant_id):
+    def __init__(self, collector, storage, tenant_id, worker_id):
         self._collector = collector
         self._storage = storage
         self._period = CONF.collect.period
         self._wait_time = CONF.collect.wait_periods * self._period
         self._tenant_id = tenant_id
+        self._worker_id = worker_id
         self._conf = ck_utils.load_conf(CONF.collect.metrics_conf)
         self._state = state.StateManager()
@@ -201,25 +211,28 @@ class Worker(BaseWorker):
raise raise
except Exception as e: except Exception as e:
LOG.warning( LOG.warning(
'[%(scope_id)s] Error while collecting metric ' '[scope: {scope}, worker: {worker}] Error while'
'%(metric)s: %(error)s. Retrying on next ' 'collecting metric {metric}: {error}. Retrying on '
'collection cycle.', 'next collection cycle.'.format(
{ scope=self._tenant_id,
'scope_id': self._tenant_id, worker=self._worker_id,
'metric': metric, metric=metric,
'error': e, error=e,
}, ),
) )
# FIXME(peschk_l): here we just exit, and the # FIXME(peschk_l): here we just exit, and the
# collection will be retried during the next collect # collection will be retried during the next collect
# cycle. In the future, we should implement a retrying # cycle. In the future, we should implement a retrying
# system in workers # system in workers
return sys.exit(0)
except collector.NoDataCollected: except collector.NoDataCollected:
LOG.info( LOG.info(
'[{}] No data collected for metric {} ' '[scope: {scope}, worker: {worker}] No data collected '
'at timestamp {}'.format( 'for metric {metric} at timestamp {ts}'.format(
self._tenant_id, metric, ck_utils.ts2dt(timestamp)) scope=self._tenant_id,
worker=self._worker_id,
metric=metric,
ts=ck_utils.ts2dt(timestamp))
) )
else: else:
# Rating # Rating
@@ -236,8 +249,11 @@ class Worker(BaseWorker):
             self._state.set_state(self._tenant_id, timestamp)


-class Orchestrator(object):
-    def __init__(self):
+class Orchestrator(cotyledon.Service):
+    def __init__(self, worker_id):
+        self._worker_id = worker_id
+        super(Orchestrator, self).__init__(self._worker_id)
+
         self.fetcher = driver.DriverManager(
             FETCHERS_NAMESPACE,
             CONF.fetcher.backend,
@@ -287,11 +303,13 @@ class Orchestrator(object):
             # pending_states = self._rating_endpoint.get_module_state()
             pass

-    def process(self):
+    def run(self):
+        LOG.debug('Started worker {}.'.format(self._worker_id))
         while True:
             self.tenants = self.fetcher.get_tenants()
             random.shuffle(self.tenants)
-            LOG.info('Tenants loaded for fetcher %s', self.fetcher.name)
+            LOG.info('[Worker: {w}] Tenants loaded for fetcher {f}'.format(
+                w=self._worker_id, f=self.fetcher.name))

             for tenant_id in self.tenants:
@@ -303,16 +321,23 @@ class Orchestrator(object):
                         self.collector,
                         self.storage,
                         tenant_id,
+                        self._worker_id,
                     )
                     worker.run()

                 lock.release()

-            # NOTE(sheeprine): Slow down looping if all tenants are
-            # being processed
-            eventlet.sleep(1)
             # FIXME(sheeprine): We may cause a drift here
-            eventlet.sleep(CONF.collect.period)
+            time.sleep(CONF.collect.period)

     def terminate(self):
+        LOG.debug('Terminating worker {}...'.format(self._worker_id))
         self.coord.stop()
+        LOG.debug('Terminated worker {}.'.format(self._worker_id))
+
+
+class OrchestratorServiceManager(cotyledon.ServiceManager):
+
+    def __init__(self):
+        super(OrchestratorServiceManager, self).__init__()
+        self.service_id = self.add(Orchestrator, workers=4)
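
The service manager above registers the Orchestrator with a fixed workers=4.
Presumably the new max_workers option is meant to drive that count; a hedged
sketch of that wiring (an assumption, not necessarily this patch's final
code), relying on CONF.orchestrator.max_workers registered in the earlier
hunk:

    class OrchestratorServiceManager(cotyledon.ServiceManager):

        def __init__(self):
            super(OrchestratorServiceManager, self).__init__()
            # Spawn as many orchestrator processes as configured; the option
            # defaults to multiprocessing.cpu_count().
            self.service_id = self.add(Orchestrator,
                                       workers=CONF.orchestrator.max_workers)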

@@ -30,6 +30,7 @@ voluptuous==0.11.1 # BSD-3
 influxdb==5.1.0 # MIT
 Flask==1.0.2 # BSD
 Flask-RESTful==0.3.5 # BSD
+cotyledon==1.5.0 # Apache-2.0

 # test-requirements
 coverage==3.6 # Apache-2.0

@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    The processor is now able to run several parallel workers. By default, one
+    worker is spawned for each available CPU. Workers can be limited through the
+    ``max_workers`` option of the ``orchestrator`` section.
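
As a usage example (the value and file path are illustrative), capping the
processor at four worker processes in cloudkitty.conf would look like:

    [orchestrator]
    max_workers = 4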

@@ -33,3 +33,4 @@ influxdb>=5.1.0,!=5.2.0,!=5.2.1,!=5.2.2;python_version<'3.0' # MIT
 influxdb>=5.1.0;python_version>='3.0' # MIT
 Flask>=1.0.2 # BSD
 Flask-RESTful>=0.3.5 # BSD
+cotyledon>=1.5.0 # Apache-2.0