Make cloudkitty-processor run several workers

This makes the processor run several workers in separate processes by
default, leading to a significant performance increase. The maximum number of
workers can be configured. Work items:

* Use cotyledon to spawn several workers (see the sketch after this list).

* Remove eventlet monkey patching and use the "threading" executor for
  messaging, in order to avoid conflicts with cotyledon's internal threads.
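
For context, here is a minimal, self-contained sketch of the cotyledon
pattern this change adopts. All names are illustrative, not CloudKitty's:

import time

import cotyledon


class DemoWorker(cotyledon.Service):
    """A service that the cotyledon master forks into its own process."""

    def __init__(self, worker_id):
        super(DemoWorker, self).__init__(worker_id)
        self._worker_id = worker_id
        self._running = True

    def run(self):
        # Called by cotyledon inside the forked child process.
        while self._running:
            print('worker {} alive'.format(self._worker_id))
            time.sleep(1)

    def terminate(self):
        # Called on graceful shutdown (e.g. SIGTERM).
        self._running = False


if __name__ == '__main__':
    manager = cotyledon.ServiceManager()
    manager.add(DemoWorker, workers=2)
    manager.run()  # blocks; workers that die are respawned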

Story: 2005423
Task: 30447
Change-Id: I7e2a77cb7d68afb87274fb44fb208306c3b32473
Luka Peschke 2019-04-12 10:00:54 +02:00
parent 394177fb74
commit 1e60561ad7
6 changed files with 62 additions and 33 deletions


@@ -27,11 +27,7 @@ def main():
     # before the prepare_service(), making cfg.CONF returning default values
     # systematically.
     from cloudkitty import orchestrator
-    processor = orchestrator.Orchestrator()
-    try:
-        processor.process()
-    except KeyboardInterrupt:
-        processor.terminate()
+    orchestrator.OrchestratorServiceManager().run()


 if __name__ == '__main__':


@@ -66,7 +66,7 @@ def get_server(target=None, endpoints=None):
     if not target:
         target = get_target()
     return oslo_messaging.get_rpc_server(transport, target,
-                                         endpoints, executor='eventlet',
+                                         endpoints, executor='threading',
                                          access_policy=access_policy)
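
For reference, a standalone oslo.messaging server using the "threading"
executor could be built as below. The topic and server names are invented
for the example, and a reachable transport_url is assumed to be present in
the configuration:

from oslo_config import cfg
import oslo_messaging


class DemoEndpoint(object):
    def ping(self, ctxt, arg):
        # With executor='threading', handlers run on plain OS threads
        # instead of eventlet greenthreads.
        return arg


transport = oslo_messaging.get_rpc_transport(cfg.CONF)
target = oslo_messaging.Target(topic='demo', server='demo-host')
server = oslo_messaging.get_rpc_server(transport, target, [DemoEndpoint()],
                                       executor='threading')
server.start()
server.wait()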


@@ -16,9 +16,12 @@
 # @author: Stéphane Albert
 #
 import decimal
+import multiprocessing
 import random
+import sys
+import time

-import eventlet
+import cotyledon
 from oslo_concurrency import lockutils
 from oslo_config import cfg
 from oslo_log import log as logging
@@ -36,18 +39,24 @@ from cloudkitty import storage_state as state
 from cloudkitty import transformer
 from cloudkitty import utils as ck_utils

-eventlet.monkey_patch()

 LOG = logging.getLogger(__name__)

 CONF = cfg.CONF

 orchestrator_opts = [
-    cfg.StrOpt('coordination_url',
-               secret=True,
-               help='Coordination driver URL',
-               default='file:///var/lib/cloudkitty/locks'),
+    cfg.StrOpt(
+        'coordination_url',
+        secret=True,
+        help='Coordination driver URL',
+        default='file:///var/lib/cloudkitty/locks'),
+    cfg.IntOpt(
+        'max_workers',
+        default=multiprocessing.cpu_count(),
+        min=1,
+        help='Max nb of workers to run. Defaults to the nb of available CPUs'),
 ]

 CONF.register_opts(orchestrator_opts, group='orchestrator')
 CONF.import_opt('backend', 'cloudkitty.fetcher', 'fetcher')
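
Once the configuration is parsed, the new option is available as
CONF.orchestrator.max_workers. A minimal oslo.config round-trip, standalone
and outside of any CloudKitty code, shows the mechanics:

import multiprocessing

from oslo_config import cfg

CONF = cfg.CONF
CONF.register_opts([
    cfg.IntOpt('max_workers',
               default=multiprocessing.cpu_count(),
               min=1,
               help='Max nb of workers to run.'),
], group='orchestrator')

CONF([])  # parse an empty command line, keeping the defaults
print(CONF.orchestrator.max_workers)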
@@ -152,12 +161,13 @@ class APIWorker(BaseWorker):
 class Worker(BaseWorker):
-    def __init__(self, collector, storage, tenant_id):
+    def __init__(self, collector, storage, tenant_id, worker_id):
         self._collector = collector
         self._storage = storage
         self._period = CONF.collect.period
         self._wait_time = CONF.collect.wait_periods * self._period
         self._tenant_id = tenant_id
+        self._worker_id = worker_id
         self._conf = ck_utils.load_conf(CONF.collect.metrics_conf)
         self._state = state.StateManager()
@@ -201,25 +211,28 @@ class Worker(BaseWorker):
                        raise
                    except Exception as e:
                        LOG.warning(
-                           '[%(scope_id)s] Error while collecting metric '
-                           '%(metric)s: %(error)s. Retrying on next '
-                           'collection cycle.',
-                           {
-                               'scope_id': self._tenant_id,
-                               'metric': metric,
-                               'error': e,
-                           },
+                           '[scope: {scope}, worker: {worker}] Error while '
+                           'collecting metric {metric}: {error}. Retrying on '
+                           'next collection cycle.'.format(
+                               scope=self._tenant_id,
+                               worker=self._worker_id,
+                               metric=metric,
+                               error=e,
+                           ),
                        )
                        # FIXME(peschk_l): here we just exit, and the
                        # collection will be retried during the next collect
                        # cycle. In the future, we should implement a retrying
                        # system in workers
-                       return
+                       sys.exit(0)
                except collector.NoDataCollected:
                    LOG.info(
-                       '[{}] No data collected for metric {} '
-                       'at timestamp {}'.format(
-                           self._tenant_id, metric, ck_utils.ts2dt(timestamp))
+                       '[scope: {scope}, worker: {worker}] No data collected '
+                       'for metric {metric} at timestamp {ts}'.format(
+                           scope=self._tenant_id,
+                           worker=self._worker_id,
+                           metric=metric,
+                           ts=ck_utils.ts2dt(timestamp))
                    )
                else:
                    # Rating
@@ -236,8 +249,11 @@ class Worker(BaseWorker):
             self._state.set_state(self._tenant_id, timestamp)


-class Orchestrator(object):
-    def __init__(self):
+class Orchestrator(cotyledon.Service):
+    def __init__(self, worker_id):
+        self._worker_id = worker_id
+        super(Orchestrator, self).__init__(self._worker_id)
         self.fetcher = driver.DriverManager(
             FETCHERS_NAMESPACE,
             CONF.fetcher.backend,
@@ -287,11 +303,13 @@ class Orchestrator(object):
         # pending_states = self._rating_endpoint.get_module_state()
         pass

-    def process(self):
+    def run(self):
+        LOG.debug('Started worker {}.'.format(self._worker_id))
         while True:
             self.tenants = self.fetcher.get_tenants()
             random.shuffle(self.tenants)
-            LOG.info('Tenants loaded for fetcher %s', self.fetcher.name)
+            LOG.info('[Worker: {w}] Tenants loaded for fetcher {f}'.format(
+                w=self._worker_id, f=self.fetcher.name))

             for tenant_id in self.tenants:
@@ -303,16 +321,23 @@ class Orchestrator(object):
                     self.collector,
                     self.storage,
                     tenant_id,
+                    self._worker_id,
                 )
                 worker.run()

                 lock.release()

             # NOTE(sheeprine): Slow down looping if all tenants are
             # being processed
-            eventlet.sleep(1)
             # FIXME(sheeprine): We may cause a drift here
-            eventlet.sleep(CONF.collect.period)
+            time.sleep(CONF.collect.period)

     def terminate(self):
+        LOG.debug('Terminating worker {}...'.format(self._worker_id))
         self.coord.stop()
+        LOG.debug('Terminated worker {}.'.format(self._worker_id))


+class OrchestratorServiceManager(cotyledon.ServiceManager):
+
+    def __init__(self):
+        super(OrchestratorServiceManager, self).__init__()
+        self.service_id = self.add(Orchestrator, workers=4)
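
With this in place, the cotyledon master process spawns the configured
number of Orchestrator children, invokes run() in each forked child, and
invokes terminate() on graceful shutdown. This is why the former process()
loop becomes run(), and why the cloudkitty-processor entry point in the
first hunk now simply calls OrchestratorServiceManager().run().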


@@ -30,6 +30,7 @@ voluptuous==0.11.1 # BSD-3
 influxdb==5.1.0 # MIT
 Flask==1.0.2 # BSD
 Flask-RESTful==0.3.5 # BSD
+cotyledon==1.5.0 # Apache-2.0

 # test-requirements
 coverage==3.6 # Apache-2.0


@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    The processor is now able to run several parallel workers. By default, one
+    worker is spawned for each available CPU. Workers can be limited through
+    the ``max_workers`` option of the ``orchestrator`` section.
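
For operators, the option lives in the ``orchestrator`` section of the
CloudKitty configuration file; the value below is only an example:

[orchestrator]
# Cap the processor at 4 worker processes instead of one per CPU.
max_workers = 4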


@@ -33,3 +33,4 @@ influxdb>=5.1.0,!=5.2.0,!=5.2.1,!=5.2.2;python_version<'3.0' # MIT
 influxdb>=5.1.0;python_version>='3.0' # MIT
 Flask>=1.0.2 # BSD
 Flask-RESTful>=0.3.5 # BSD
+cotyledon>=1.5.0 # Apache-2.0