# Copyright 2014 Objectif Libre
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
from datetime import timedelta
import decimal
import functools
import hashlib
import multiprocessing
import random
import sys
import time

import cotyledon
import futurist
from futurist import waiters
from oslo_concurrency import lockutils
from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging
from oslo_utils import uuidutils
from stevedore import driver
from tooz import coordination

from cloudkitty import collector
from cloudkitty import config  # noqa
from cloudkitty import dataframe
from cloudkitty import extension_manager
from cloudkitty import messaging
from cloudkitty import storage
from cloudkitty import storage_state as state
from cloudkitty import utils as ck_utils
from cloudkitty.utils import tz as tzutils

LOG = logging.getLogger(__name__)

CONF = cfg.CONF

orchestrator_opts = [
    cfg.StrOpt(
        'coordination_url',
        secret=True,
        help='Coordination driver URL',
        default='file:///var/lib/cloudkitty/locks'),
    cfg.IntOpt(
        'max_workers',
        default=multiprocessing.cpu_count(),
        sample_default=4,
        min=1,
        help='Maximum number of workers to run. Defaults to the number '
             'of available CPUs.'),
    cfg.IntOpt('max_threads',
               # NOTE(peschk_l): This is the futurist default
               default=multiprocessing.cpu_count() * 5,
               sample_default=20,
               min=1,
               deprecated_name='max_greenthreads',
               advanced=True,
               help='Maximum number of threads to use per worker. Defaults '
                    'to 5 times the number of available CPUs.'),
]

CONF.register_opts(orchestrator_opts, group='orchestrator')

CONF.import_opt('backend', 'cloudkitty.fetcher', 'fetcher')

FETCHERS_NAMESPACE = 'cloudkitty.fetchers'
PROCESSORS_NAMESPACE = 'cloudkitty.rating.processors'
COLLECTORS_NAMESPACE = 'cloudkitty.collector.backends'
STORAGES_NAMESPACE = 'cloudkitty.storage.backends'
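
# Stevedore entry-point namespaces from which the fetcher, rating
# processor, collector and storage plugins are loaded below.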


def get_lock(coord, tenant_id):
    """Return a lock name and the matching tooz lock for a scope.

    The name hashes the scope id together with the configured collector,
    fetcher backend and scope key, so that concurrent processors
    serialize their work on a given scope.
    """
    name = hashlib.sha256(
        ("cloudkitty-"
         + str(tenant_id + '-')
         + str(CONF.collect.collector + '-')
         + str(CONF.fetcher.backend + '-')
         + str(CONF.collect.scope_key)).encode('ascii')).hexdigest()
    return name, coord.get_lock(name.encode('ascii'))


class RatingEndpoint(object):
    """RPC endpoint for rating module management and quote requests."""

    target = oslo_messaging.Target(namespace='rating',
                                   version='1.0')

    def __init__(self, orchestrator):
        self._global_reload = False
        self._pending_reload = []
        self._module_state = {}
        self._orchestrator = orchestrator

    def get_reload_list(self):
        lock = lockutils.lock('module-reload')
        with lock:
            reload_list = self._pending_reload
            self._pending_reload = []
            return reload_list

    def get_module_state(self):
        lock = lockutils.lock('module-state')
        with lock:
            module_list = self._module_state
            self._module_state = {}
            return module_list
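
    # ``quote()`` below converts the v1 RPC payload into v2 DataPoints
    # before rating. ``res_data['usage']`` maps each metric name to a
    # list of points; an illustrative (made-up) payload:
    #
    #     {'cpu': [{'vol': {'unit': 'instance', 'qty': 1},
    #               'desc': {'groupby': {...}, 'metadata': {...}}}]}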
    def quote(self, ctxt, res_data):
        LOG.debug('Received quote request [%s] from RPC.', res_data)
        worker = APIWorker()

        start = tzutils.localized_now()
        end = tzutils.add_delta(start, timedelta(seconds=CONF.collect.period))

        # Need to prepare data to support the V2 processing format
        usage = {}
        for k in res_data['usage']:
            all_data_points_for_metric = []
            all_quote_data_entries = res_data['usage'][k]
            for p in all_quote_data_entries:
                vol = p['vol']
                desc = p.get('desc', {})

                data_point = dataframe.DataPoint(
                    vol['unit'],
                    vol['qty'],
                    0,
                    desc.get('groupby', []),
                    desc.get('metadata', []),
                )
                all_data_points_for_metric.append(data_point)
            usage[k] = all_data_points_for_metric

        frame = dataframe.DataFrame(
            start=start,
            end=end,
            usage=usage,
        )

        quote_result = worker.quote(frame)
        LOG.debug("Quote result [%s] for input data [%s].",
                  quote_result, res_data)
        return str(quote_result)

    def reload_modules(self, ctxt):
        LOG.info('Received reload modules command.')
        lock = lockutils.lock('module-reload')
        with lock:
            self._global_reload = True

    def reload_module(self, ctxt, name):
        LOG.info('Received reload command for module %s.', name)
        lock = lockutils.lock('module-reload')
        with lock:
            if name not in self._pending_reload:
                self._pending_reload.append(name)

    def enable_module(self, ctxt, name):
        LOG.info('Received enable command for module %s.', name)
        lock = lockutils.lock('module-state')
        with lock:
            self._module_state[name] = True

    def disable_module(self, ctxt, name):
        LOG.info('Received disable command for module %s.', name)
        lock = lockutils.lock('module-state')
        with lock:
            self._module_state[name] = False
            if name in self._pending_reload:
                self._pending_reload.remove(name)
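

# The ScopeEndpoint below serves state reset requests: it deletes the
# stored dataframes of each scope from the requested date onwards, then
# rewinds the scope state so the data is collected and rated again.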
class ScopeEndpoint(object):
    target = oslo_messaging.Target(version='1.0')

    def __init__(self):
        self._coord = coordination.get_coordinator(
            CONF.orchestrator.coordination_url,
            uuidutils.generate_uuid().encode('ascii'))
        self._state = state.StateManager()
        self._storage = storage.get_storage()
        self._coord.start(start_heart=True)

    def reset_state(self, ctxt, res_data):
        LOG.info('Received state reset command. {}'.format(res_data))
        # Process scopes in random order, so that concurrent resets are
        # less likely to queue on the same locks in the same order.
        random.shuffle(res_data['scopes'])
        for scope in res_data['scopes']:
            lock_name, lock = get_lock(self._coord, scope['scope_id'])
            LOG.debug(
                '[ScopeEndpoint] Trying to acquire lock "{}" ...'.format(
                    lock_name,
                )
            )
            if lock.acquire(blocking=True):
                LOG.debug(
                    '[ScopeEndpoint] Acquired lock "{}".'.format(
                        lock_name,
                    )
                )
                state_dt = tzutils.dt_from_iso(res_data['state'])
                try:
                    self._storage.delete(begin=state_dt, end=None, filters={
                        scope['scope_key']: scope['scope_id'],
                    })
                    self._state.set_state(
                        scope['scope_id'],
                        state_dt,
                        fetcher=scope['fetcher'],
                        collector=scope['collector'],
                        scope_key=scope['scope_key'],
                    )
                finally:
                    lock.release()
                    LOG.debug(
                        '[ScopeEndpoint] Released lock "{}".'.format(
                            lock_name,
                        )
                    )
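

# BaseWorker loads the enabled rating processors for a scope. APIWorker
# reuses them to rate one-off quote dataframes, while Worker (further
# below) drives the periodic collect -> rate -> store cycle.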
class BaseWorker(object):
    def __init__(self, tenant_id=None):
        self._tenant_id = tenant_id

        # Rating processors
        self._processors = []
        self._load_rating_processors()

    def _load_rating_processors(self):
        self._processors = []
        processors = extension_manager.EnabledExtensionManager(
            PROCESSORS_NAMESPACE,
            invoke_kwds={'tenant_id': self._tenant_id})

        for processor in processors:
            self._processors.append(processor)
        # Highest-priority processors are applied first
        self._processors.sort(key=lambda x: x.obj.priority, reverse=True)


class APIWorker(BaseWorker):
    def __init__(self, tenant_id=None):
        super(APIWorker, self).__init__(tenant_id)

    def quote(self, res_data):
        quote_result = res_data
        for processor in self._processors:
            quote_result = processor.obj.quote(quote_result)

        price = decimal.Decimal(0)
        for _, point in quote_result.iterpoints():
            price += point.price
        return price
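

# _check_state is shared, through functools.partial, by Worker and
# Orchestrator: it returns the next timestamp to process for a scope,
# or None when the scope is already up to date given the configured
# collect period and wait_periods.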
def _check_state(obj, period, tenant_id):
    timestamp = obj._state.get_state(tenant_id)
    return ck_utils.check_time_state(timestamp,
                                     period,
                                     CONF.collect.wait_periods)


class Worker(BaseWorker):
    def __init__(self, collector, storage, tenant_id, worker_id):
        self._collector = collector
        self._storage = storage
        self._period = CONF.collect.period
        self._wait_time = CONF.collect.wait_periods * self._period
        self._tenant_id = tenant_id
        self._worker_id = worker_id
        self._log_prefix = '[scope: {scope}, worker: {worker}] '.format(
            scope=self._tenant_id, worker=self._worker_id)
        self._conf = ck_utils.load_conf(CONF.collect.metrics_conf)
        self._state = state.StateManager()
        self._check_state = functools.partial(
            _check_state, self, self._period, self._tenant_id)

        super(Worker, self).__init__(self._tenant_id)

    def _collect(self, metric, start_timestamp):
        next_timestamp = tzutils.add_delta(
            start_timestamp, timedelta(seconds=self._period))

        name, data = self._collector.retrieve(
            metric,
            start_timestamp,
            next_timestamp,
            self._tenant_id,
        )
        if not data:
            raise collector.NoDataCollected

        return name, data

    def _do_collection(self, metrics, timestamp):

        def _get_result(metric):
            try:
                return self._collect(metric, timestamp)
            except collector.NoDataCollected:
                LOG.info(
                    self._log_prefix + 'No data collected '
                    'for metric {metric} at timestamp {ts}'.format(
                        metric=metric, ts=timestamp))
                return metric, None
            except Exception as e:
                LOG.exception(
                    self._log_prefix + 'Error while collecting'
                    ' metric {metric} at timestamp {ts}: {e}. Exiting.'.format(
                        metric=metric, ts=timestamp, e=e))
                # FIXME(peschk_l): here we just exit, and the
                # collection will be retried during the next collect
                # cycle. In the future, we should implement a retrying
                # system in workers
                sys.exit(1)

        with futurist.ThreadPoolExecutor(
                max_workers=CONF.orchestrator.max_threads) as tpool:
            futs = [tpool.submit(_get_result, metric) for metric in metrics]
            LOG.debug(self._log_prefix +
                      'Collecting {} metrics.'.format(len(metrics)))
            results = [r.result() for r in waiters.wait_for_all(futs).done]
            LOG.debug(self._log_prefix + 'Collecting {} metrics took {}s '
                      'total, with {}s average'.format(
                          tpool.statistics.executed,
                          tpool.statistics.runtime,
                          tpool.statistics.average_runtime))

        return dict(filter(lambda x: x[1] is not None, results))
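
    # A single iteration of run() handles one collect period: get the
    # next timestamp from the state manager, collect every configured
    # metric in parallel, apply the rating processors in priority order,
    # then push the resulting dataframe and advance the scope state.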
    def run(self):
        while True:
            timestamp = self._check_state()
            if not timestamp:
                break

            metrics = list(self._conf['metrics'].keys())

            # Collection
            usage_data = self._do_collection(metrics, timestamp)
            frame = dataframe.DataFrame(
                start=timestamp,
                end=tzutils.add_delta(timestamp,
                                      timedelta(seconds=self._period)),
                usage=usage_data,
            )

            # Rating
            for processor in self._processors:
                frame = processor.obj.process(frame)

            # Writing
            self._storage.push([frame], self._tenant_id)
            self._state.set_state(self._tenant_id, timestamp)
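

# One Orchestrator instance runs per cotyledon worker process. Its main
# loop fetches the scope list, shuffles it, and takes each scope's lock
# without blocking: scopes locked by another process are skipped until
# the next pass, one period later.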
class Orchestrator(cotyledon.Service):
    def __init__(self, worker_id):
        self._worker_id = worker_id
        super(Orchestrator, self).__init__(self._worker_id)

        self.fetcher = driver.DriverManager(
            FETCHERS_NAMESPACE,
            CONF.fetcher.backend,
            invoke_on_load=True,
        ).driver

        self.collector = collector.get_collector()
        self.storage = storage.get_storage()
        self._state = state.StateManager()

        # RPC
        self.server = None
        self._rating_endpoint = RatingEndpoint(self)
        self._scope_endpoint = ScopeEndpoint()
        self._init_messaging()

        # DLM
        self.coord = coordination.get_coordinator(
            CONF.orchestrator.coordination_url,
            uuidutils.generate_uuid().encode('ascii'))
        self.coord.start(start_heart=True)
        self._check_state = functools.partial(
            _check_state, self, CONF.collect.period)

    def _init_messaging(self):
        target = oslo_messaging.Target(topic='cloudkitty',
                                       server=CONF.host,
                                       version='1.0')
        endpoints = [
            self._rating_endpoint,
            self._scope_endpoint,
        ]
        self.server = messaging.get_server(target, endpoints)
        self.server.start()

    def process_messages(self):
        # TODO(sheeprine): Code kept to handle threading and asynchronous
        # reloading
        # pending_reload = self._rating_endpoint.get_reload_list()
        # pending_states = self._rating_endpoint.get_module_state()
        pass

    def run(self):
        LOG.debug('Started worker {}.'.format(self._worker_id))
        while True:
            self.tenants = self.fetcher.get_tenants()
            random.shuffle(self.tenants)
            LOG.info('[Worker: {w}] Tenants loaded for fetcher {f}'.format(
                w=self._worker_id, f=self.fetcher.name))

            for tenant_id in self.tenants:
                lock_name, lock = get_lock(self.coord, tenant_id)
                LOG.debug(
                    '[Worker: {w}] Trying to acquire lock "{l}" ...'.format(
                        w=self._worker_id, l=lock_name)
                )
                if lock.acquire(blocking=False):
                    LOG.debug(
                        '[Worker: {w}] Acquired lock "{l}".'.format(
                            w=self._worker_id, l=lock_name)
                    )
                    state = self._check_state(tenant_id)
                    if state:
                        worker = Worker(
                            self.collector,
                            self.storage,
                            tenant_id,
                            self._worker_id,
                        )
                        worker.run()
                    lock.release()

            # FIXME(sheeprine): We may cause a drift here
            time.sleep(CONF.collect.period)

    def terminate(self):
        LOG.debug('Terminating worker {}...'.format(self._worker_id))
        self.coord.stop()
        LOG.debug('Terminated worker {}.'.format(self._worker_id))


class OrchestratorServiceManager(cotyledon.ServiceManager):

    def __init__(self):
        super(OrchestratorServiceManager, self).__init__()
        self.service_id = self.add(Orchestrator,
                                   workers=CONF.orchestrator.max_workers)
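

# A minimal sketch of how this service manager is usually launched (the
# actual entry point lives in cloudkitty's processor command; main() is
# illustrative here):
#
#     def main():
#         OrchestratorServiceManager().run()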