Make cloudkitty-processor run several workers
This makes the processor run several workers in separate processes by
default, leading to a significant performance increase. The maximum
number of workers can be configured.

Work items:

* Use cotyledon to spawn several workers.
* Remove eventlet monkey patching and use the "threading" executor for
  messaging, in order to avoid conflicts with cotyledon's internal
  threads.

Story: 2005423
Task: 30447
Change-Id: I7e2a77cb7d68afb87274fb44fb208306c3b32473
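For context, the cotyledon pattern this change adopts is sketched below: a cotyledon.Service subclass implements the per-process work loop, and a cotyledon.ServiceManager forks and supervises N copies of it. This is a minimal illustrative sketch, not CloudKitty code; the Echo* class names are invented.

import multiprocessing

import cotyledon


class EchoService(cotyledon.Service):
    # Hypothetical worker: cotyledon instantiates this in each child
    # process and passes it a worker index.
    def __init__(self, worker_id):
        super(EchoService, self).__init__(worker_id)
        self._worker_id = worker_id

    def run(self):
        # Main loop of the child process.
        print('worker {} started'.format(self._worker_id))

    def terminate(self):
        # Invoked on graceful shutdown (SIGTERM/SIGINT).
        print('worker {} terminating'.format(self._worker_id))


class EchoServiceManager(cotyledon.ServiceManager):
    def __init__(self):
        super(EchoServiceManager, self).__init__()
        # One process per CPU, mirroring the new max_workers default.
        self.add(EchoService, workers=multiprocessing.cpu_count())


if __name__ == '__main__':
    EchoServiceManager().run()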
@@ -27,11 +27,7 @@ def main():
     # before the prepare_service(), making cfg.CONF returning default values
     # systematically.
     from cloudkitty import orchestrator
-    processor = orchestrator.Orchestrator()
-    try:
-        processor.process()
-    except KeyboardInterrupt:
-        processor.terminate()
+    orchestrator.OrchestratorServiceManager().run()


 if __name__ == '__main__':
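The cloudkitty-processor entry point no longer drives a single Orchestrator in the main process: OrchestratorServiceManager().run() blocks while cotyledon forks and supervises the worker processes. The explicit KeyboardInterrupt handling can be dropped because cotyledon installs its own SIGINT/SIGTERM handling and calls each service's terminate() on shutdown.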
@@ -66,7 +66,7 @@ def get_server(target=None, endpoints=None):
     if not target:
         target = get_target()
     return oslo_messaging.get_rpc_server(transport, target,
-                                         endpoints, executor='eventlet',
+                                         endpoints, executor='threading',
                                          access_policy=access_policy)


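The executor swap is the counterpart of dropping eventlet: oslo.messaging's "threading" executor dispatches incoming RPC calls on native threads, whereas the "eventlet" executor assumes a monkey-patched standard library, which, per the commit message, would conflict with the threads cotyledon runs internally.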
@@ -16,9 +16,12 @@
 # @author: Stéphane Albert
 #
 import decimal
+import multiprocessing
 import random
+import sys
+import time

-import eventlet
+import cotyledon
 from oslo_concurrency import lockutils
 from oslo_config import cfg
 from oslo_log import log as logging
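The import changes track the rest of the diff: eventlet disappears entirely, multiprocessing supplies the cpu_count() default for the new max_workers option, sys is needed for sys.exit(), time replaces eventlet.sleep(), and cotyledon provides the process supervision.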
@@ -36,18 +39,24 @@ from cloudkitty import storage_state as state
 from cloudkitty import transformer
 from cloudkitty import utils as ck_utils

-eventlet.monkey_patch()
-
 LOG = logging.getLogger(__name__)

 CONF = cfg.CONF

 orchestrator_opts = [
-    cfg.StrOpt('coordination_url',
-               secret=True,
-               help='Coordination driver URL',
-               default='file:///var/lib/cloudkitty/locks'),
+    cfg.StrOpt(
+        'coordination_url',
+        secret=True,
+        help='Coordination driver URL',
+        default='file:///var/lib/cloudkitty/locks'),
+    cfg.IntOpt(
+        'max_workers',
+        default=multiprocessing.cpu_count(),
+        min=1,
+        help='Max nb of workers to run. Defaults to the nb of available CPUs'),
 ]

 CONF.register_opts(orchestrator_opts, group='orchestrator')

 CONF.import_opt('backend', 'cloudkitty.fetcher', 'fetcher')
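With the IntOpt registered under the orchestrator group, operators can cap the number of processor workers from cloudkitty.conf. A sketch of the relevant section (values are illustrative; both options fall back to the defaults shown in the diff above):

[orchestrator]
# Coordination driver URL used to synchronize workers; this is the default.
coordination_url = file:///var/lib/cloudkitty/locks
# Cap the worker count; defaults to the number of available CPUs.
max_workers = 4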
@@ -152,12 +161,13 @@ class APIWorker(BaseWorker):


 class Worker(BaseWorker):
-    def __init__(self, collector, storage, tenant_id):
+    def __init__(self, collector, storage, tenant_id, worker_id):
         self._collector = collector
         self._storage = storage
         self._period = CONF.collect.period
         self._wait_time = CONF.collect.wait_periods * self._period
         self._tenant_id = tenant_id
+        self._worker_id = worker_id
         self._conf = ck_utils.load_conf(CONF.collect.metrics_conf)
         self._state = state.StateManager()

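The extra worker_id parameter is only stored on the Worker; as the logging changes below show, it is used to tag log messages so that output from concurrent workers can be told apart.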
@@ -201,25 +211,28 @@ class Worker(BaseWorker):
                     raise
                 except Exception as e:
                     LOG.warning(
-                        '[%(scope_id)s] Error while collecting metric '
-                        '%(metric)s: %(error)s. Retrying on next '
-                        'collection cycle.',
-                        {
-                            'scope_id': self._tenant_id,
-                            'metric': metric,
-                            'error': e,
-                        },
+                        '[scope: {scope}, worker: {worker}] Error while '
+                        'collecting metric {metric}: {error}. Retrying on '
+                        'next collection cycle.'.format(
+                            scope=self._tenant_id,
+                            worker=self._worker_id,
+                            metric=metric,
+                            error=e,
+                        ),
                     )
                     # FIXME(peschk_l): here we just exit, and the
                     # collection will be retried during the next collect
                     # cycle. In the future, we should implement a retrying
                     # system in workers
-                    return
+                    sys.exit(0)
             except collector.NoDataCollected:
                 LOG.info(
-                    '[{}] No data collected for metric {} '
-                    'at timestamp {}'.format(
-                        self._tenant_id, metric, ck_utils.ts2dt(timestamp))
+                    '[scope: {scope}, worker: {worker}] No data collected '
+                    'for metric {metric} at timestamp {ts}'.format(
+                        scope=self._tenant_id,
+                        worker=self._worker_id,
+                        metric=metric,
+                        ts=ck_utils.ts2dt(timestamp))
                 )
             else:
                 # Rating
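Replacing return with sys.exit(0) changes the failure behaviour: the Worker now runs inside a cotyledon-managed process, so exiting terminates that process and lets the service manager respawn a fresh one instead of continuing in the same interpreter. As the FIXME notes, a real retry mechanism remains future work.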
@@ -236,8 +249,11 @@ class Worker(BaseWorker):
             self._state.set_state(self._tenant_id, timestamp)


-class Orchestrator(object):
-    def __init__(self):
+class Orchestrator(cotyledon.Service):
+    def __init__(self, worker_id):
+        self._worker_id = worker_id
+        super(Orchestrator, self).__init__(self._worker_id)
+
         self.fetcher = driver.DriverManager(
             FETCHERS_NAMESPACE,
             CONF.fetcher.backend,
@@ -287,11 +303,13 @@ class Orchestrator(object):
         # pending_states = self._rating_endpoint.get_module_state()
         pass

-    def process(self):
+    def run(self):
+        LOG.debug('Started worker {}.'.format(self._worker_id))
         while True:
             self.tenants = self.fetcher.get_tenants()
             random.shuffle(self.tenants)
-            LOG.info('Tenants loaded for fetcher %s', self.fetcher.name)
+            LOG.info('[Worker: {w}] Tenants loaded for fetcher {f}'.format(
+                w=self._worker_id, f=self.fetcher.name))

             for tenant_id in self.tenants:

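The rename from process() to run() follows the cotyledon contract: run() is the method cotyledon.Service invokes as the main loop of the spawned process (cf. the sketch after the commit message).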
@@ -303,16 +321,23 @@ class Orchestrator(object):
                     self.collector,
                     self.storage,
                     tenant_id,
+                    self._worker_id,
                 )
                 worker.run()

                lock.release()

-                # NOTE(sheeprine): Slow down looping if all tenants are
-                # being processed
-                eventlet.sleep(1)
             # FIXME(sheeprine): We may cause a drift here
-            eventlet.sleep(CONF.collect.period)
+            time.sleep(CONF.collect.period)

     def terminate(self):
+        LOG.debug('Terminating worker {}...'.format(self._worker_id))
         self.coord.stop()
+        LOG.debug('Terminated worker {}.'.format(self._worker_id))
+
+
+class OrchestratorServiceManager(cotyledon.ServiceManager):
+
+    def __init__(self):
+        super(OrchestratorServiceManager, self).__init__()
+        self.service_id = self.add(Orchestrator,
+                                   workers=CONF.orchestrator.max_workers)
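Taken together: the service manager forks max_workers Orchestrator processes, and each worker iterates over the full (shuffled) tenant list. The per-tenant coordination lock, backed by coordination_url, is what keeps two workers from processing the same scope at once; the shuffle merely reduces lock contention. The final sleep can still drift, as the FIXME notes.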
@@ -30,6 +30,7 @@ voluptuous==0.11.1 # BSD-3
 influxdb==5.1.0 # MIT
 Flask==1.0.2 # BSD
 Flask-RESTful==0.3.5 # BSD
+cotyledon==1.5.0 # Apache-2.0

 # test-requirements
 coverage==3.6 # Apache-2.0
@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    The processor is now able to run several parallel workers. By default, one
+    worker is spawned for each available CPU. Workers can be limited through
+    the ``max_workers`` option of the ``orchestrator`` section.
@@ -33,3 +33,4 @@ influxdb>=5.1.0,!=5.2.0,!=5.2.1,!=5.2.2;python_version<'3.0' # MIT
 influxdb>=5.1.0;python_version>='3.0' # MIT
 Flask>=1.0.2 # BSD
 Flask-RESTful>=0.3.5 # BSD
+cotyledon>=1.5.0 # Apache-2.0