fuel-web/nailgun/nailgun/transactions/manager.py

# -*- coding: utf-8 -*-
# Copyright 2016 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import itertools

import six

from nailgun import consts
from nailgun.db import db
from nailgun import errors
from nailgun import lcm
from nailgun.logger import logger
from nailgun import notifier
from nailgun import objects
from nailgun.objects.serializers import node as node_serializers
from nailgun.orchestrator import deployment_serializers
from nailgun import rpc
from nailgun.settings import settings
from nailgun.task import helpers
from nailgun.task import legacy_tasks_adapter
from nailgun.utils import dict_update
from nailgun.utils import get_in
from nailgun.utils import mule
from nailgun.utils import resolvers
from nailgun import yaql_ext


_DEFAULT_NODE_ATTRIBUTES = {
    'on_success': {'status': consts.NODE_STATUSES.ready},
    'on_error': {'status': consts.NODE_STATUSES.error},
    'on_stop': {'status': consts.NODE_STATUSES.stopped},
}

_DEFAULT_NODE_FILTER = (
    "not $.pending_addition and not $.pending_deletion and "
    "($.status in [ready, provisioned, stopped] or $.error_type = 'deploy')"
)


def _get_node_attributes(graph, kind):
    """Return node status transitions of the given kind for a graph.

    Falls back to the module-level defaults if the graph does not
    override them.
    """
    r = get_in(graph, kind, 'node_attributes')
    if r is None:
        r = _DEFAULT_NODE_ATTRIBUTES[kind]
    return r
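

# Illustrative example (graph shape assumed from the get_in usage above):
# a graph may override the default transitions via 'node_attributes', e.g.
#
#     graph = {'on_error': {'node_attributes': {'status': 'error'}}}
#     _get_node_attributes(graph, 'on_error')  # -> {'status': 'error'}
#     _get_node_attributes({}, 'on_stop')      # -> default 'stopped' status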


def make_astute_message(transaction, context, graph, node_resolver):
    """Assemble a 'task_deploy' RPC message for Astute."""
    directory, tasks, metadata = lcm.TransactionSerializer.serialize(
        context, graph['tasks'], node_resolver
    )
    metadata['node_statuses_transitions'] = {
        'successful': _get_node_attributes(graph, 'on_success'),
        'failed': _get_node_attributes(graph, 'on_error'),
        'stopped': _get_node_attributes(graph, 'on_stop')
    }
    subgraphs = transaction.cache.get('subgraphs')
    if subgraphs:
        metadata['subgraphs'] = subgraphs
    objects.DeploymentHistoryCollection.create(transaction, tasks)
    return {
        'api_version': settings.VERSION['api'],
        'method': 'task_deploy',
        'respond_to': 'transaction_resp',
        'args': {
            'task_uuid': transaction.uuid,
            'tasks_directory': directory,
            'tasks_graph': tasks,
            'tasks_metadata': metadata,
            'dry_run': transaction.cache.get('dry_run'),
            'noop_run': transaction.cache.get('noop_run'),
            'debug': transaction.cache.get('debug'),
        }
    }
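

# For illustration only: the message assembled above is an RPC payload of
# roughly the following shape (values are examples, not real output):
#
#     {
#         'api_version': '1',
#         'method': 'task_deploy',
#         'respond_to': 'transaction_resp',
#         'args': {
#             'task_uuid': '<transaction uuid>',
#             'tasks_directory': {...},  # task definitions, keyed by task id
#             'tasks_graph': {...},      # per-node ordered lists of tasks
#             'tasks_metadata': {'node_statuses_transitions': {...}},
#             'dry_run': False,
#             'noop_run': False,
#             'debug': False,
#         },
#     }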


class try_transaction(object):
    """Wrap a transaction in pre-/post- actions.

    So far it includes the following actions:

      * mark the transaction as failed if an exception has been raised;
      * create an action log record on start/finish.

    :param transaction: a transaction instance to be wrapped
    :param on_error: a callback invoked with (transaction, message) when
        an exception is raised; if it returns True the exception is
        suppressed
    """

    def __init__(self, transaction, on_error):
        self._transaction = transaction
        self._on_error = on_error

    def __enter__(self):
        logger.debug(
            "Transaction %s starts assembling.", self._transaction.id
        )
        return self._transaction

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_val:
            logger.error(
                "Transaction %s failed.",
                self._transaction.id, exc_info=(exc_type, exc_val, exc_tb)
            )
            return self._on_error(self._transaction, six.text_type(exc_val))
        else:
            logger.debug(
                "Transaction %s finished assembling.", self._transaction.id
            )
            return False
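

# Sketch of intended usage (per the callers below): the context manager
# delegates failures to the supplied callback, e.g.
#
#     with try_transaction(transaction, self.fail):
#         ...  # serialization that may raise
#
# Because fail() returns True, the exception is swallowed after the
# transaction has been marked as failed (standard __exit__ semantics).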


class TransactionsManager(object):

    # We're moving towards an everything-is-a-graph approach where there's
    # no place for transaction names. From now on we're going to use the
    # transaction's attributes (e.g. graph_type, dry_run) to find out what
    # the transaction is about. Still, we need to specify a transaction
    # name until we move everything to graphs.
    task_name = consts.TASK_NAMES.deployment

    def __init__(self, cluster_id):
        self.cluster_id = cluster_id

    def execute(self, graphs, dry_run=False, noop_run=False, force=False,
                debug=False, subgraphs=None):
        """Start a new transaction with the given parameters.

        Under the hood, starting a new transaction means serializing a lot
        of stuff and assembling an Astute message. So at the end of the
        method we either send an Astute message with the execution flow or
        mark the transaction as failed.

        :param graphs: a list of graphs to be run on the given nodes
        :param dry_run: run the new transaction in dry-run mode
        :param noop_run: run the new transaction in noop-run mode
        :param force: re-evaluate task conditions as if it were a first run
        :param debug: enable debug mode for the tasks executor
        :param subgraphs: an optional list of subgraphs to execute
        """
        logger.info(
            'Start new transaction: '
            'cluster=%d graphs=%s dry_run=%d noop_run=%s force=%d',
            self.cluster_id, graphs, dry_run, noop_run, force
        )

        # So far we don't support parallel execution of transactions within
        # one cluster, so we need to fail quickly if there's a transaction
        # in progress.
        cluster = self._acquire_cluster()

        # Unfortunately, for historical reasons the UI polls 'deployment'
        # tasks for a cluster and expects there to be only one. That one is
        # considered the latest and is used for tracking progress and
        # showing error messages. So we have come up with the following
        # workaround:
        #
        #  * on each new transaction we mark previous ones as deleted
        #  * the /tasks endpoint doesn't return "deleted" transactions
        #  * the /transactions endpoint does return "deleted" transactions
        #
        # FIXME: We must provide a way to get the latest transaction with
        # its sub-transactions via the API. Once it's done, and the UI
        # uses it, we can safely remove this workaround.
        _remove_obsolete_tasks(cluster)

        transaction = objects.Transaction.create({
            'name': consts.TASK_NAMES.deploy,
            'cluster_id': self.cluster_id,
            'status': consts.TASK_STATUSES.running,
            'dry_run': dry_run or noop_run,
        })
        objects.Transaction.on_start(transaction)
        helpers.TaskHelper.create_action_log(transaction)

        for graph in graphs:
            # The 'dry_run' flag is part of the transaction, so we can
            # restore its value anywhere. That doesn't apply to the 'force'
            # flag, because it affects only context calculation. However,
            # we need to pass it down somehow in order to build the context
            # once the first graph is executed (much, much later, when we
            # call continue_ in the RPC receiver).
            cache = graph.copy()
            cache['force'] = force
            cache['noop_run'] = noop_run
            cache['dry_run'] = dry_run
            cache['debug'] = debug
            cache['subgraphs'] = subgraphs

            transaction.create_subtask(
                self.task_name,
                status=consts.TASK_STATUSES.pending,
                dry_run=dry_run or noop_run,
                graph_type=graph['type'],
                # We need to save the input parameters in the cache, so the
                # RPC receiver can use them for further serialization.
                #
                # FIXME: Consider using a separate set of columns.
                cache=cache,
            )

        # We need to commit the transaction because the asynchronous call
        # below might be executed in a separate process or thread.
        db().commit()

        self.continue_(transaction)
        return transaction
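
    # Illustrative call (values assumed): running the default deployment
    # graph for a cluster in dry-run mode looks roughly like
    #
    #     manager = TransactionsManager(cluster_id)
    #     manager.execute([{'type': 'default'}], dry_run=True)
    #
    # Each dict in ``graphs`` becomes the cache of one pending
    # sub-transaction.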

    def continue_(self, transaction):
        """Pick the next pending task and send it for execution.

        A transaction may consist of a number of sub-transactions, and we
        should execute them one by one. This method picks the first
        pending sub-transaction and sends it for execution.

        :param transaction: a top-level transaction to continue
        :return: True if a sub-transaction was started, otherwise False
        """
        sub_transaction = next((
            sub_transaction
            for sub_transaction in transaction.subtasks
            if sub_transaction.status == consts.TASK_STATUSES.pending), None)

        if sub_transaction is None:
            # There are no pending sub-transactions, so we can close this
            # transaction.
            self.success(transaction)
            return False

        with try_transaction(transaction, self.fail):
            # A uWSGI mule is a separate process, which means it won't
            # share our DB session. Hence, we can't pass fetched DB
            # instances to the function we want to be executed in the
            # mule, so let's proceed with unique identifiers.
            mule.call_task_manager_async(
                self.__class__,
                '_execute_async',
                self.cluster_id,
                sub_transaction.id,
            )
        return True
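
    # Note (inferred from the call above): mule.call_task_manager_async
    # re-creates the manager inside the mule process, roughly equivalent
    # to
    #
    #     TransactionsManager(cluster_id)._execute_async(sub_transaction_id)
    #
    # which is why only plain identifiers cross the process boundary.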

    def process(self, transaction, report):
        """Process feedback from the executor (Astute).

        :param transaction: a transaction to handle (sibling, not
            top-level)
        :param report: a report to process
        """
        nodes = report.get('nodes', [])
        error = report.get('error')
        status = report.get('status')
        progress = report.get('progress')

        # The report may contain two virtual nodes: master and cluster
        # ('None'). Since we don't have them in the database, we must
        # ensure we don't try to update them.
        nodes_params = {
            str(node['uid']): node for node in nodes
            if node['uid'] not in (consts.MASTER_NODE_UID, None)
        }
        nodes_instances = objects.NodeCollection.lock_for_update(
            objects.NodeCollection.filter_by_list(
                None, 'id', nodes_params.keys(), order_by=('id', )
            )
        ).all()

        _update_nodes(transaction, nodes_instances, nodes_params)
        _update_history(transaction, nodes)
        _update_transaction(transaction, status, progress, error)

        if status in (consts.TASK_STATUSES.error, consts.TASK_STATUSES.ready):
            objects.Transaction.on_finish(transaction, status)
            helpers.TaskHelper.update_action_log(transaction)

            if transaction.parent:
                # If the transaction completed successfully, we've got to
                # initiate the next one in the chain; otherwise the whole
                # chain is failed.
                if status == consts.TASK_STATUSES.ready:
                    self.continue_(transaction.parent)
                else:
                    self.fail(transaction.parent, error)

    def success(self, transaction):
        objects.Transaction.on_finish(transaction, consts.TASK_STATUSES.ready)
        helpers.TaskHelper.update_action_log(transaction)
        _update_cluster_status(transaction)
        notifier.notify(
            consts.NOTIFICATION_TOPICS.done,
            "Graph execution has been successfully completed. "
            "You can check deployment history for detailed information.",
            transaction.cluster_id,
            None,
            task_uuid=transaction.uuid
        )

    def fail(self, transaction, reason):
        objects.Transaction.on_finish(
            transaction, consts.TASK_STATUSES.error, message=reason
        )
        helpers.TaskHelper.update_action_log(transaction)

        for sub_transaction in transaction.subtasks:
            if sub_transaction.status == consts.TASK_STATUSES.pending:
                # on_start and on_finish are called to properly handle
                # the status transition
                objects.Transaction.on_start(sub_transaction)
                objects.Transaction.on_finish(
                    sub_transaction, consts.TASK_STATUSES.error, "Aborted"
                )

        _update_cluster_status(transaction)
        notifier.notify(
            consts.NOTIFICATION_TOPICS.error,
            "Graph execution failed with error: '{0}'. "
            "Please check deployment history for more details."
            .format(reason),
            transaction.cluster_id,
            None,
            task_uuid=transaction.uuid
        )
        # Returning True makes try_transaction suppress the original
        # exception.
        return True

    def _execute_async(self, sub_transaction_id):
        sub_transaction = objects.Transaction.get_by_uid(sub_transaction_id)

        with try_transaction(sub_transaction.parent, self.fail):
            self._execute_sync(sub_transaction)

        # Since the whole function is executed in a separate process, we
        # must commit all changes so they are not lost.
        db().commit()

    def _execute_sync(self, sub_transaction):
        cluster = sub_transaction.cluster
        graph = objects.Cluster.get_deployment_graph(
            cluster, sub_transaction.graph_type
        )
        nodes = _get_nodes_to_run(
            cluster,
            graph.get('node_filter'),
            sub_transaction.cache.get('nodes')
        )
        logger.debug(
            "execute graph %s on nodes %s",
            sub_transaction.graph_type, [n.id for n in nodes]
        )

        # We should initialize the primary roles for the cluster before
        # the role resolver is created.
        objects.Cluster.set_primary_tags(cluster, nodes)
        resolver = resolvers.TagResolver(nodes)

        _adjust_graph_tasks(
            graph,
            cluster,
            resolver,
            sub_transaction.cache.get('tasks'))

        context = lcm.TransactionContext(
            _get_expected_state(cluster, nodes),
            _get_current_state(
                cluster, nodes, graph['tasks'],
                sub_transaction.cache.get('force')
            ))

        _prepare_nodes(nodes, sub_transaction.dry_run, context.new['nodes'])

        # Attach the desired state to the sub-transaction, so when we
        # continue our top-level transaction, the new state will be
        # calculated on top of this.
        _dump_expected_state(sub_transaction, context.new, graph['tasks'])

        message = make_astute_message(
            sub_transaction, context, graph, resolver
        )
        objects.Transaction.on_start(sub_transaction)
        helpers.TaskHelper.create_action_log(sub_transaction)

        # Once rpc.cast() is called, the message is sent to Astute. By
        # that moment all transaction instances must exist in the
        # database; otherwise we may get a wrong result because the RPC
        # receiver won't find an entry to update.
        db().commit()
        rpc.cast('naily', [message])

    def _acquire_cluster(self):
        cluster = objects.Cluster.get_by_uid(
            self.cluster_id, fail_if_not_found=True, lock_for_update=True
        )
        running_tasks = objects.TaskCollection.all_in_progress(
            cluster_id=cluster.id
        )
        # TODO(bgaifullin) need new lock approach for cluster
        if objects.TaskCollection.count(running_tasks):
            raise errors.DeploymentAlreadyStarted()
        return cluster


def _remove_obsolete_tasks(cluster):
    all_tasks = objects.TaskCollection.all_not_deleted()
    cluster_tasks = objects.TaskCollection.filter_by(
        all_tasks, cluster_id=cluster.id
    )
    finished_tasks = objects.TaskCollection.filter_by_list(
        cluster_tasks, 'status',
        [consts.TASK_STATUSES.ready, consts.TASK_STATUSES.error]
    )
    finished_tasks = objects.TaskCollection.order_by(finished_tasks, 'id')
    for task in finished_tasks:
        objects.Task.delete(task)
    db().flush()


def _get_nodes_to_run(cluster, node_filter, ids=None):
    # Trying to run tasks on offline nodes will lead to an error, since
    # MCollective is most probably unreachable. In order to avoid that,
    # we need to select only online nodes.
    nodes = objects.NodeCollection.filter_by(
        None, cluster_id=cluster.id, online=True)

    if node_filter is None:
        node_filter = _DEFAULT_NODE_FILTER

    if ids is None and node_filter:
        logger.debug("applying nodes filter: %s", node_filter)
        # TODO(bgaifullin) Need to implement an adapter for YAQL
        # to query data directly from the DB instead of fetching all of it.
        yaql_exp = yaql_ext.get_default_engine()(
            '$.where({0}).select($.id)'.format(node_filter)
        )
        ids = yaql_exp.evaluate(
            data=objects.NodeCollection.to_list(
                nodes,
                # TODO(bgaifullin) remove the hard-coded list of fields;
                # the field network_data causes the subsequent cluster
                # serialization to fail, because it modifies node
                # attributes and that update would be stored in the DB.
                serializer=node_serializers.NodeSerializerForDeployment
            ),
            context=yaql_ext.create_context(
                add_extensions=True, yaqlized=False
            )
        )

    if ids is not None:
        logger.debug("filter by node_ids: %s", ids)
        nodes = objects.NodeCollection.filter_by_list(nodes, 'id', ids)

    return objects.NodeCollection.lock_for_update(
        objects.NodeCollection.order_by(nodes, 'id')
    ).all()
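

# Illustrative example of the YAQL filtering above (data shape assumed):
# given serialized nodes like
#
#     [{'id': 1, 'status': 'ready', 'pending_addition': False,
#       'pending_deletion': False, 'error_type': None},
#      {'id': 2, 'status': 'discover', 'pending_addition': True,
#       'pending_deletion': False, 'error_type': None}]
#
# the default filter matches only node 1, so the expression
# '$.where(...).select($.id)' evaluates to [1].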


def _adjust_graph_tasks(graph, cluster, node_resolver, names=None):
    if objects.Cluster.is_propagate_task_deploy_enabled(cluster):
        # TODO(bgaifullin) move this code into Cluster.get_deployment_tasks
        # after the dependency on role_resolver is removed
        if graph['type'] == consts.DEFAULT_DEPLOYMENT_GRAPH_TYPE:
            plugin_tasks = objects.Cluster.get_legacy_plugin_tasks(cluster)
        else:
            plugin_tasks = None

        graph['tasks'] = list(legacy_tasks_adapter.adapt_legacy_tasks(
            graph['tasks'], plugin_tasks, node_resolver
        ))

    if names:
        # Filter tasks by name; mark all other tasks as skipped.
        task_ids = set(names)
        tasks = graph['tasks']
        for idx, task in enumerate(tasks):
            if (task['id'] not in task_ids and
                    task['type'] not in consts.INTERNAL_TASKS):
                task = task.copy()
                task['type'] = consts.ORCHESTRATOR_TASK_TYPES.skipped
                tasks[idx] = task


def _is_node_for_redeploy(node):
    if node is None:
        return False
    if node.pending_addition:
        return True
    return bool(node.error_type) or node.status in (
        consts.NODE_STATUSES.discover,
        consts.NODE_STATUSES.error,
        consts.NODE_STATUSES.provisioned,
        consts.NODE_STATUSES.stopped,
    )
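

# In other words (summary of the conditions above): a node is redeployed
# from scratch if it is new (pending addition), previously failed, or not
# yet fully deployed; nodes already in 'ready' state keep their previously
# recorded deployment state.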


def _get_current_state(cluster, nodes, tasks, force=False):
    # In case of force=True the current state is {}, which means: behave
    # like an initial deployment.
    if force:
        return {}

    nodes = {n.uid: n for n in nodes}
    nodes[consts.MASTER_NODE_UID] = None
    tasks_names = [
        t['id'] for t in tasks if t['type'] not in consts.INTERNAL_TASKS
    ]

    txs = objects.TransactionCollection.get_successful_transactions_per_task(
        cluster.id, tasks_names, nodes
    )
    state = {}
    for tx, data in itertools.groupby(txs, lambda x: x[0]):
        node_ids = []
        common_attrs = {}
        deferred_state = {}
        for _, node_id, task_name in data:
            t_state = state.setdefault(task_name, {
                'nodes': {}, 'common': common_attrs
            })
            if _is_node_for_redeploy(nodes.get(node_id)):
                t_state['nodes'][node_id] = {}
            else:
                t_state['nodes'][node_id] = deferred_state.setdefault(
                    node_id, {}
                )
                node_ids.append(node_id)

        deployment_info = objects.Transaction.get_deployment_info(
            tx, node_uids=node_ids)
        common_attrs.update(deployment_info['common'])
        dict_update(deferred_state, deployment_info['nodes'], level=2)
    return state
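

# The structure built above is keyed by task name; for example
# (illustrative values):
#
#     {'netconfig': {'common': {...shared attrs...},
#                    'nodes': {'1': {...previous state...}, '2': {}}}}
#
# An empty per-node dict means "no previous state recorded", which makes
# the task behave as if it had never run on that node.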


def _get_expected_state(cluster, nodes):
    info = deployment_serializers.serialize_for_lcm(cluster, nodes)
    info['nodes'] = {n['uid']: n for n in info['nodes']}
    # Add the virtual node that represents the cluster itself ('None').
    info['nodes'][None] = {}
    return info


def _dump_expected_state(transaction, state, tasks):
    cluster = transaction.cluster

    objects.Transaction.attach_deployment_info(transaction, state)
    objects.Transaction.attach_tasks_snapshot(transaction, tasks)
    objects.Transaction.attach_cluster_settings(
        transaction,
        {
            'editable': objects.Cluster.get_editable_attributes(cluster, True)
        })
    objects.Transaction.attach_network_settings(
        transaction, objects.Cluster.get_network_attributes(cluster))

    db().flush()


def _prepare_nodes(nodes, dry_run, involved_node_ids):
    for node in (node for node in nodes if node.uid in involved_node_ids):
        # Set progress to show that the node is in the in-progress state.
        node.progress = 1
        if not dry_run:
            node.error_type = None
            node.error_msg = None


def _update_nodes(transaction, nodes_instances, nodes_params):
    allow_update = {
        'name',
        'status',
        'hostname',
        'kernel_params',
        'pending_addition',
        'pending_deletion',
        'error_msg',
        'online',
        'progress',
    }

    # Dry-run transactions must not update nodes, except for the progress
    # column.
    if transaction.dry_run:
        allow_update = {'progress'}

    for node in nodes_instances:
        node_params = nodes_params.pop(node.uid)

        for param in allow_update.intersection(node_params):
            if param == 'status':
                new_status = node_params['status']
                if new_status == 'deleted':
                    # 'deleted' is a special status which causes the node
                    # to be removed from the cluster
                    objects.Node.remove_from_cluster(node)
                elif new_status == 'error':
                    # TODO(bgaifullin) do not persist status in DB
                    node.status = new_status
                    node.error_type = node_params.get(
                        'error_type', consts.NODE_ERRORS.deploy
                    )
                    node.progress = 100

                    # Notify about the particular node failure.
                    notifier.notify(
                        consts.NOTIFICATION_TOPICS.error,
                        u"Node '{0}' failed: {1}".format(
                            node.name,
                            node_params.get('error_msg', "Unknown error")
                        ),
                        cluster_id=transaction.cluster_id,
                        node_id=node.uid,
                        task_uuid=transaction.uuid
                    )
                elif new_status == 'ready':
                    # TODO(bgaifullin) need to remove the pending roles
                    # concept
                    node.roles = list(set(node.roles + node.pending_roles))
                    node.pending_roles = []
                    node.progress = 100
                    node.status = new_status
                else:
                    node.status = new_status
            else:
                setattr(node, param, node_params[param])

    db().flush()

    if nodes_params:
        logger.warning(
            "The following nodes are not found: %s",
            ",".join(sorted(nodes_params.keys()))
        )


def _update_history(transaction, nodes):
    for node in nodes:
        if {'deployment_graph_task_name', 'task_status'}.issubset(node):
            objects.DeploymentHistory.update_if_exist(
                transaction.id,
                node['uid'],
                node['deployment_graph_task_name'],
                node['task_status'],
                node.get('summary'),
                node.get('custom'),
            )
    db().flush()


def _update_transaction(transaction, status, progress, message):
    data = {}
    if status:
        data['status'] = status
    if message:
        data['message'] = message
    data['progress'] = _calculate_progress(transaction, progress)

    if data:
        objects.Transaction.update(transaction, data)

    if transaction.parent and data['progress']:
        logger.debug("Updating parent task: %s.", transaction.parent.uuid)
        siblings = transaction.parent.subtasks
        total_progress = sum(x.progress for x in siblings)
        objects.Transaction.update(transaction.parent, {
            'progress': total_progress // len(siblings)
        })
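

# Example of the parent progress roll-up above: with three sibling
# sub-transactions at 100, 50 and 0 percent, the parent progress becomes
# (100 + 50 + 0) // 3 == 50.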


def _calculate_progress(transaction, progress):
    if progress is not None:
        return progress
    return helpers.TaskHelper.recalculate_deployment_task_progress(
        transaction)


def _update_cluster_status(transaction):
    if transaction.dry_run:
        return

    nodes = objects.NodeCollection.filter_by(
        None, cluster_id=transaction.cluster_id
    )
    failed_nodes = objects.NodeCollection.filter_by_not(
        nodes, error_type=None
    )
    not_ready_nodes = objects.NodeCollection.filter_by_not(
        nodes, status=consts.NODE_STATUSES.ready
    )

    # If all nodes are ready, the cluster gets the 'operational' status;
    # otherwise it gets the 'partially deployed' status.
    if (objects.NodeCollection.count(failed_nodes) or
            objects.NodeCollection.count(not_ready_nodes)):
        status = consts.CLUSTER_STATUSES.partially_deployed
    else:
        status = consts.CLUSTER_STATUSES.operational

    objects.Cluster.update(transaction.cluster, {'status': status})