# Copyright 2014 Objectif Libre
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
from datetime import timedelta
import decimal
import functools
import hashlib
import multiprocessing
import random
import sys
import time

import cotyledon
import futurist
from futurist import waiters
from oslo_concurrency import lockutils
from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging
from oslo_utils import uuidutils
from stevedore import driver
from tooz import coordination

from cloudkitty import collector
from cloudkitty import config  # noqa
from cloudkitty import dataframe
from cloudkitty import extension_manager
from cloudkitty import messaging
from cloudkitty import storage
from cloudkitty import storage_state as state
from cloudkitty import utils as ck_utils
from cloudkitty.utils import tz as tzutils

LOG = logging.getLogger(__name__)

CONF = cfg.CONF

orchestrator_opts = [
    cfg.StrOpt(
        'coordination_url',
        secret=True,
        help='Coordination driver URL',
        default='file:///var/lib/cloudkitty/locks'),
    cfg.IntOpt(
        'max_workers',
        default=multiprocessing.cpu_count(),
        sample_default=4,
        min=1,
        help='Maximum number of workers to run. Defaults to the number '
             'of available CPUs.'),
    cfg.IntOpt('max_threads',
               # NOTE(peschk_l): This is the futurist default
               default=multiprocessing.cpu_count() * 5,
               sample_default=20,
               min=1,
               deprecated_name='max_greenthreads',
               advanced=True,
               help='Maximum number of threads to use per worker. Defaults '
                    'to 5 times the number of available CPUs.'),
]

CONF.register_opts(orchestrator_opts, group='orchestrator')

CONF.import_opt('backend', 'cloudkitty.fetcher', 'fetcher')

FETCHERS_NAMESPACE = 'cloudkitty.fetchers'
PROCESSORS_NAMESPACE = 'cloudkitty.rating.processors'
COLLECTORS_NAMESPACE = 'cloudkitty.collector.backends'
STORAGES_NAMESPACE = 'cloudkitty.storage.backends'
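
# Stevedore entry-point namespaces from which the fetcher, rating
# processor, collector and storage plugins are loaded below.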


def get_lock(coord, tenant_id):
    """Return a lock name and the matching tooz lock for a scope.

    The name hashes the scope id together with the configured collector,
    fetcher backend and scope key, so that concurrent processors
    serialize their work on a given scope.
    """
    name = hashlib.sha256(
        ("cloudkitty-"
         + str(tenant_id + '-')
         + str(CONF.collect.collector + '-')
         + str(CONF.fetcher.backend + '-')
         + str(CONF.collect.scope_key)).encode('ascii')).hexdigest()
    return name, coord.get_lock(name.encode('ascii'))


class RatingEndpoint(object):
    """RPC endpoint for rating module management and quote requests."""

    target = oslo_messaging.Target(namespace='rating',
                                   version='1.0')

    def __init__(self, orchestrator):
        self._global_reload = False
        self._pending_reload = []
        self._module_state = {}
        self._orchestrator = orchestrator

    def get_reload_list(self):
        lock = lockutils.lock('module-reload')
        with lock:
            reload_list = self._pending_reload
            self._pending_reload = []
            return reload_list

    def get_module_state(self):
        lock = lockutils.lock('module-state')
        with lock:
            module_list = self._module_state
            self._module_state = {}
            return module_list
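
    # ``quote()`` below converts the v1 RPC payload into v2 DataPoints
    # before rating. ``res_data['usage']`` maps each metric name to a
    # list of points; an illustrative (made-up) payload:
    #
    #     {'cpu': [{'vol': {'unit': 'instance', 'qty': 1},
    #               'desc': {'groupby': {...}, 'metadata': {...}}}]}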
    def quote(self, ctxt, res_data):
        LOG.debug('Received quote request [%s] from RPC.', res_data)
        worker = APIWorker()

        start = tzutils.localized_now()
        end = tzutils.add_delta(start, timedelta(seconds=CONF.collect.period))

        # Need to prepare data to support the V2 processing format
        usage = {}
        for k in res_data['usage']:
            all_data_points_for_metric = []
            all_quote_data_entries = res_data['usage'][k]
            for p in all_quote_data_entries:
                vol = p['vol']
                desc = p.get('desc', {})

                data_point = dataframe.DataPoint(
                    vol['unit'],
                    vol['qty'],
                    0,
                    desc.get('groupby', []),
                    desc.get('metadata', []),
                )
                all_data_points_for_metric.append(data_point)
            usage[k] = all_data_points_for_metric

        frame = dataframe.DataFrame(
            start=start,
            end=end,
            usage=usage,
        )

        quote_result = worker.quote(frame)
        LOG.debug("Quote result [%s] for input data [%s].",
                  quote_result, res_data)
        return str(quote_result)

    def reload_modules(self, ctxt):
        LOG.info('Received reload modules command.')
        lock = lockutils.lock('module-reload')
        with lock:
            self._global_reload = True

    def reload_module(self, ctxt, name):
        LOG.info('Received reload command for module %s.', name)
        lock = lockutils.lock('module-reload')
        with lock:
            if name not in self._pending_reload:
                self._pending_reload.append(name)

    def enable_module(self, ctxt, name):
        LOG.info('Received enable command for module %s.', name)
        lock = lockutils.lock('module-state')
        with lock:
            self._module_state[name] = True

    def disable_module(self, ctxt, name):
        LOG.info('Received disable command for module %s.', name)
        lock = lockutils.lock('module-state')
        with lock:
            self._module_state[name] = False
            if name in self._pending_reload:
                self._pending_reload.remove(name)
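

# The ScopeEndpoint below serves state reset requests: it deletes the
# stored dataframes of each scope from the requested date onwards, then
# rewinds the scope state so the data is collected and rated again.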
class ScopeEndpoint(object):
    target = oslo_messaging.Target(version='1.0')

    def __init__(self):
        self._coord = coordination.get_coordinator(
            CONF.orchestrator.coordination_url,
            uuidutils.generate_uuid().encode('ascii'))
        self._state = state.StateManager()
        self._storage = storage.get_storage()
        self._coord.start(start_heart=True)

    def reset_state(self, ctxt, res_data):
        LOG.info('Received state reset command. {}'.format(res_data))
        # Process scopes in random order, so that concurrent resets are
        # less likely to queue on the same locks in the same order.
        random.shuffle(res_data['scopes'])
        for scope in res_data['scopes']:
            lock_name, lock = get_lock(self._coord, scope['scope_id'])
            LOG.debug(
                '[ScopeEndpoint] Trying to acquire lock "{}" ...'.format(
                    lock_name,
                )
            )
            if lock.acquire(blocking=True):
                LOG.debug(
                    '[ScopeEndpoint] Acquired lock "{}".'.format(
                        lock_name,
                    )
                )
                state_dt = tzutils.dt_from_iso(res_data['state'])
                try:
                    self._storage.delete(begin=state_dt, end=None, filters={
                        scope['scope_key']: scope['scope_id'],
                    })
                    self._state.set_state(
                        scope['scope_id'],
                        state_dt,
                        fetcher=scope['fetcher'],
                        collector=scope['collector'],
                        scope_key=scope['scope_key'],
                    )
                finally:
                    lock.release()
                    LOG.debug(
                        '[ScopeEndpoint] Released lock "{}".'.format(
                            lock_name,
                        )
                    )
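

# BaseWorker loads the enabled rating processors for a scope. APIWorker
# reuses them to rate one-off quote dataframes, while Worker (further
# below) drives the periodic collect -> rate -> store cycle.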
class BaseWorker(object):
    def __init__(self, tenant_id=None):
        self._tenant_id = tenant_id

        # Rating processors
        self._processors = []
        self._load_rating_processors()

    def _load_rating_processors(self):
        self._processors = []
        processors = extension_manager.EnabledExtensionManager(
            PROCESSORS_NAMESPACE,
            invoke_kwds={'tenant_id': self._tenant_id})

        for processor in processors:
            self._processors.append(processor)
        # Highest-priority processors are applied first
        self._processors.sort(key=lambda x: x.obj.priority, reverse=True)


class APIWorker(BaseWorker):
    def __init__(self, tenant_id=None):
        super(APIWorker, self).__init__(tenant_id)

    def quote(self, res_data):
        quote_result = res_data
        for processor in self._processors:
            quote_result = processor.obj.quote(quote_result)

        price = decimal.Decimal(0)
        for _, point in quote_result.iterpoints():
            price += point.price
        return price
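

# _check_state is shared, through functools.partial, by Worker and
# Orchestrator: it returns the next timestamp to process for a scope,
# or None when the scope is already up to date given the configured
# collect period and wait_periods.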
def _check_state(obj, period, tenant_id):
    timestamp = obj._state.get_state(tenant_id)
    return ck_utils.check_time_state(timestamp,
                                     period,
                                     CONF.collect.wait_periods)


class Worker(BaseWorker):
    def __init__(self, collector, storage, tenant_id, worker_id):
        self._collector = collector
        self._storage = storage
        self._period = CONF.collect.period
        self._wait_time = CONF.collect.wait_periods * self._period
        self._tenant_id = tenant_id
        self._worker_id = worker_id
        self._log_prefix = '[scope: {scope}, worker: {worker}] '.format(
            scope=self._tenant_id, worker=self._worker_id)
        self._conf = ck_utils.load_conf(CONF.collect.metrics_conf)
        self._state = state.StateManager()
        self._check_state = functools.partial(
            _check_state, self, self._period, self._tenant_id)

        super(Worker, self).__init__(self._tenant_id)

    def _collect(self, metric, start_timestamp):
        next_timestamp = tzutils.add_delta(
            start_timestamp, timedelta(seconds=self._period))

        name, data = self._collector.retrieve(
            metric,
            start_timestamp,
            next_timestamp,
            self._tenant_id,
        )
        if not data:
            raise collector.NoDataCollected

        return name, data

    def _do_collection(self, metrics, timestamp):

        def _get_result(metric):
            try:
                return self._collect(metric, timestamp)
            except collector.NoDataCollected:
                LOG.info(
                    self._log_prefix + 'No data collected '
                    'for metric {metric} at timestamp {ts}'.format(
                        metric=metric, ts=timestamp))
                return metric, None
            except Exception as e:
                LOG.exception(
                    self._log_prefix + 'Error while collecting'
                    ' metric {metric} at timestamp {ts}: {e}. Exiting.'.format(
                        metric=metric, ts=timestamp, e=e))
                # FIXME(peschk_l): here we just exit, and the
                # collection will be retried during the next collect
                # cycle. In the future, we should implement a retrying
                # system in workers
                sys.exit(1)

        with futurist.ThreadPoolExecutor(
                max_workers=CONF.orchestrator.max_threads) as tpool:
            futs = [tpool.submit(_get_result, metric) for metric in metrics]
            LOG.debug(self._log_prefix +
                      'Collecting {} metrics.'.format(len(metrics)))
            results = [r.result() for r in waiters.wait_for_all(futs).done]
            LOG.debug(self._log_prefix + 'Collecting {} metrics took {}s '
                      'total, with {}s average'.format(
                          tpool.statistics.executed,
                          tpool.statistics.runtime,
                          tpool.statistics.average_runtime))

        return dict(filter(lambda x: x[1] is not None, results))
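
    # A single iteration of run() handles one collect period: get the
    # next timestamp from the state manager, collect every configured
    # metric in parallel, apply the rating processors in priority order,
    # then push the resulting dataframe and advance the scope state.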
    def run(self):
        while True:
            timestamp = self._check_state()
            if not timestamp:
                break

            metrics = list(self._conf['metrics'].keys())

            # Collection
            usage_data = self._do_collection(metrics, timestamp)
            frame = dataframe.DataFrame(
                start=timestamp,
                end=tzutils.add_delta(timestamp,
                                      timedelta(seconds=self._period)),
                usage=usage_data,
            )

            # Rating
            for processor in self._processors:
                frame = processor.obj.process(frame)

            # Writing
            self._storage.push([frame], self._tenant_id)
            self._state.set_state(self._tenant_id, timestamp)
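

# One Orchestrator instance runs per cotyledon worker process. Its main
# loop fetches the scope list, shuffles it, and takes each scope's lock
# without blocking: scopes locked by another process are skipped until
# the next pass, one period later.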
class Orchestrator(cotyledon.Service):
    def __init__(self, worker_id):
        self._worker_id = worker_id
        super(Orchestrator, self).__init__(self._worker_id)

        self.fetcher = driver.DriverManager(
            FETCHERS_NAMESPACE,
            CONF.fetcher.backend,
            invoke_on_load=True,
        ).driver

        self.collector = collector.get_collector()
        self.storage = storage.get_storage()
        self._state = state.StateManager()

        # RPC
        self.server = None
        self._rating_endpoint = RatingEndpoint(self)
        self._scope_endpoint = ScopeEndpoint()
        self._init_messaging()

        # DLM
        self.coord = coordination.get_coordinator(
            CONF.orchestrator.coordination_url,
            uuidutils.generate_uuid().encode('ascii'))
        self.coord.start(start_heart=True)
        self._check_state = functools.partial(
            _check_state, self, CONF.collect.period)

    def _init_messaging(self):
        target = oslo_messaging.Target(topic='cloudkitty',
                                       server=CONF.host,
                                       version='1.0')
        endpoints = [
            self._rating_endpoint,
            self._scope_endpoint,
        ]
        self.server = messaging.get_server(target, endpoints)
        self.server.start()

    def process_messages(self):
        # TODO(sheeprine): Code kept to handle threading and asynchronous
        # reloading
        # pending_reload = self._rating_endpoint.get_reload_list()
        # pending_states = self._rating_endpoint.get_module_state()
        pass

    def run(self):
        LOG.debug('Started worker {}.'.format(self._worker_id))
        while True:
            self.tenants = self.fetcher.get_tenants()
            random.shuffle(self.tenants)
            LOG.info('[Worker: {w}] Tenants loaded for fetcher {f}'.format(
                w=self._worker_id, f=self.fetcher.name))

            for tenant_id in self.tenants:
                lock_name, lock = get_lock(self.coord, tenant_id)
                LOG.debug(
                    '[Worker: {w}] Trying to acquire lock "{l}" ...'.format(
                        w=self._worker_id, l=lock_name)
                )
                if lock.acquire(blocking=False):
                    LOG.debug(
                        '[Worker: {w}] Acquired lock "{l}".'.format(
                            w=self._worker_id, l=lock_name)
                    )
                    state = self._check_state(tenant_id)
                    if state:
                        worker = Worker(
                            self.collector,
                            self.storage,
                            tenant_id,
                            self._worker_id,
                        )
                        worker.run()
                    lock.release()

            # FIXME(sheeprine): We may cause a drift here
            time.sleep(CONF.collect.period)

    def terminate(self):
        LOG.debug('Terminating worker {}...'.format(self._worker_id))
        self.coord.stop()
        LOG.debug('Terminated worker {}.'.format(self._worker_id))


class OrchestratorServiceManager(cotyledon.ServiceManager):

    def __init__(self):
        super(OrchestratorServiceManager, self).__init__()
        self.service_id = self.add(Orchestrator,
                                   workers=CONF.orchestrator.max_workers)
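

# A minimal sketch of how this service manager is usually launched (the
# actual entry point lives in cloudkitty's processor command; main() is
# illustrative here):
#
#     def main():
#         OrchestratorServiceManager().run()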