# Copyright 2016 - Nokia Networks.
# Copyright 2016 - Brocade Communications Systems, Inc.
# Copyright 2018 - Extreme Networks, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc
import json

from oslo_config import cfg
from oslo_log import log as logging
from osprofiler import profiler
import six

from mistral.db.v2 import api as db_api
from mistral.db.v2.sqlalchemy import models as db_models
from mistral.engine import dispatcher
from mistral.engine import post_tx_queue
from mistral.engine import utils as engine_utils
from mistral import exceptions as exc
from mistral import expressions as expr
from mistral.lang import parser as spec_parser
from mistral.notifiers import base as notif
from mistral.notifiers import notification_events as events
from mistral.rpc import clients as rpc
from mistral.services import triggers
from mistral.services import workflows as wf_service
from mistral import utils
from mistral.utils import merge_dicts
from mistral.utils import wf_trace
from mistral.workflow import base as wf_base
from mistral.workflow import commands
from mistral.workflow import data_flow
from mistral.workflow import lookup_utils
from mistral.workflow import states
from mistral_lib import actions as ml_actions

LOG = logging.getLogger(__name__)


@six.add_metaclass(abc.ABCMeta)
class Workflow(object):
    """Workflow.

    Represents a workflow and defines an interface used by the Mistral
    engine and its components to manipulate workflows.
    """

    def __init__(self, wf_ex=None):
        self.wf_ex = wf_ex

        if wf_ex:
            # We're processing a workflow that's already in progress.
            self.wf_spec = spec_parser.get_workflow_spec_by_execution_id(
                wf_ex.id
            )
        else:
            # The spec is assigned when the workflow is started.
            self.wf_spec = None

    def notify(self, event):
        publishers = self.wf_ex.params.get('notify')

        # Notifications are dispatched only if a non-empty list of
        # publishers is configured for this execution.
        if not publishers or not isinstance(publishers, list):
            return

        notifier = notif.get_notifier(cfg.CONF.notifier.type)

        notifier.notify(
            self.wf_ex.id,
            self.wf_ex.to_dict(),
            event,
            self.wf_ex.updated_at,
            publishers
        )

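    # A sketch of what the 'notify' parameter is expected to look like
    # (illustrative only; the publisher entries and their keys depend on
    # the notification publishers installed in the deployment):
    #
    #   params = {
    #       'notify': [
    #           {'type': 'webhook', 'url': 'https://example.com/hook'},
    #           {'type': 'noop'}
    #       ]
    #   }
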
    @profiler.trace('workflow-start')
    def start(self, wf_def, wf_ex_id, input_dict, desc='', params=None):
        """Start workflow.

        :param wf_def: Workflow definition.
        :param wf_ex_id: Workflow execution id.
        :param input_dict: Workflow input.
        :param desc: Workflow execution description.
        :param params: Workflow type specific parameters.

        :raises exc.InputException: if the workflow input is invalid.
        """

        assert not self.wf_ex

        # New workflow execution.
        self.wf_spec = spec_parser.get_workflow_spec_by_definition_id(
            wf_def.id,
            wf_def.updated_at
        )

        wf_trace.info(
            self.wf_ex,
            'Starting workflow [name=%s, input=%s]' %
            (wf_def.name, utils.cut(input_dict))
        )

        self.validate_input(input_dict)

        self._create_execution(
            wf_def,
            wf_ex_id,
            self.prepare_input(input_dict),
            desc,
            params
        )

        self.set_state(states.RUNNING)

        # Publish event as soon as state is set to running.
        self.notify(events.WORKFLOW_LAUNCHED)

        wf_ctrl = wf_base.get_controller(self.wf_ex, self.wf_spec)

        dispatcher.dispatch_workflow_commands(
            self.wf_ex,
            wf_ctrl.continue_workflow()
        )

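    # A minimal usage sketch (illustrative only; in practice this method
    # is invoked by the engine RPC server, and wf_def/input_dict come
    # from the API layer):
    #
    #   wf = Workflow()
    #   wf.start(
    #       wf_def,
    #       wf_ex_id=utils.generate_unicode_uuid(),
    #       input_dict={'vm_name': 'test_vm'},
    #       params={}
    #   )
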
    def stop(self, state, msg=None):
        """Stop workflow.

        :param state: New workflow state.
        :param msg: Additional explaining message.
        """
        assert self.wf_ex

        if state == states.SUCCESS:
            self._succeed_workflow(self._get_final_context(), msg)
        elif state == states.ERROR:
            self._fail_workflow(self._get_final_context(), msg)
        elif state == states.CANCELLED:
            self._cancel_workflow(msg)

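    # E.g. (a sketch): wf.stop(states.ERROR, msg='Stopped by operator')
    # finalizes the execution with the final context evaluated at the
    # moment of stopping. States other than SUCCESS, ERROR and CANCELLED
    # are ignored here.
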
    def pause(self, msg=None):
        """Pause workflow.

        :param msg: Additional explaining message.
        """

        assert self.wf_ex

        if states.is_paused(self.wf_ex.state):
            return

        # Set the state of this workflow to paused.
        self.set_state(states.PAUSED, state_info=msg)

        # Publish event.
        self.notify(events.WORKFLOW_PAUSED)

        # If workflow execution is a subworkflow,
        # schedule update to the task execution.
        if self.wf_ex.task_execution_id:
            # Import the task_handler module here to avoid circular reference.
            from mistral.engine import task_handler

            task_handler.schedule_on_action_update(self.wf_ex)

    def resume(self, env=None):
        """Resume workflow.

        :param env: Environment.
        """

        assert self.wf_ex

        wf_service.update_workflow_execution_env(self.wf_ex, env)

        self.set_state(states.RUNNING)

        # Publish event.
        self.notify(events.WORKFLOW_RESUMED)

        wf_ctrl = wf_base.get_controller(self.wf_ex)

        # Calculate commands to process next.
        cmds = wf_ctrl.continue_workflow()

        self._continue_workflow(cmds)

        # If workflow execution is a subworkflow,
        # schedule update to the task execution.
        if self.wf_ex.task_execution_id:
            # Import the task_handler module here to avoid circular reference.
            from mistral.engine import task_handler

            task_handler.schedule_on_action_update(self.wf_ex)

    def prepare_input(self, input_dict):
        # Fill in missing or undefined inputs with the default values
        # declared in the workflow specification.
        for k, v in self.wf_spec.get_input().items():
            if k not in input_dict or input_dict[k] is utils.NotDefined:
                input_dict[k] = v

        return input_dict

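    # For example (hypothetical spec defaults): if get_input() returns
    # {'vm_name': 'default_vm', 'flavor': utils.NotDefined} then
    # prepare_input({'flavor': 'm1.small'}) yields
    # {'flavor': 'm1.small', 'vm_name': 'default_vm'}.
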
    def validate_input(self, input_dict):
        engine_utils.validate_input(
            self.wf_spec.get_input(),
            input_dict,
            self.wf_spec.get_name(),
            self.wf_spec.__class__.__name__
        )

    def rerun(self, task_ex, reset=True, env=None):
        """Rerun workflow from the given task.

        :param task_ex: Task execution that the workflow needs to rerun from.
        :param reset: If True, reset task state including deleting its action
            executions.
        :param env: Environment.
        """

        assert self.wf_ex

        # Since some lookup utils functions may use cache for completed tasks
        # we need to clean caches to make sure that stale objects can't be
        # retrieved.
        lookup_utils.clear_caches()

        wf_service.update_workflow_execution_env(self.wf_ex, env)

        self._recursive_rerun()

        wf_ctrl = wf_base.get_controller(self.wf_ex)

        # Calculate commands to process next.
        cmds = wf_ctrl.rerun_tasks([task_ex], reset=reset)

        if cmds:
            # Import the policies module here to avoid circular reference.
            from mistral.engine import policies

            policies.RetryPolicy.refresh_runtime_context(task_ex)

        self._continue_workflow(cmds)

    def _recursive_rerun(self):
        """Rerun all parent workflow executions recursively.

        If there is a parent execution, it is rerun as well.
        """

        from mistral.engine import workflow_handler

        self.set_state(states.RUNNING)

        # TODO(rakhmerov): We call an internal method of a module here.
        # The simplest way is to make it public, however, I believe
        # it's another "bad smell" that tells that some refactoring
        # of the architecture should be made.
        workflow_handler._schedule_check_and_fix_integrity(self.wf_ex)

        if self.wf_ex.task_execution_id:
            parent_task_ex = db_api.get_task_execution(
                self.wf_ex.task_execution_id
            )

            parent_wf = Workflow(wf_ex=parent_task_ex.workflow_execution)

            parent_wf.lock()

            parent_wf._recursive_rerun()

            from mistral.engine import task_handler

            task_handler.rerun_task(parent_task_ex, parent_wf.wf_spec)

    def _get_backlog(self):
        # Commands that the dispatcher could not process immediately are
        # kept in the execution's runtime context under BACKLOG_KEY.
        return self.wf_ex.runtime_context.get(dispatcher.BACKLOG_KEY)

    def _continue_workflow(self, cmds):
        # When resuming a workflow we need to ignore all 'pause'
        # commands because the workflow controller takes tasks that
        # completed within the period when the workflow was paused.
        cmds = [c for c in cmds if not isinstance(c, commands.PauseWorkflow)]

        # Since there's no explicit task causing the operation
        # we need to mark all unprocessed tasks as processed
        # because the workflow controller takes only completed tasks
        # with the flag 'processed' equal to False.
        for t_ex in self.wf_ex.task_executions:
            if states.is_completed(t_ex.state) and not t_ex.processed:
                t_ex.processed = True

        if cmds or self._get_backlog():
            dispatcher.dispatch_workflow_commands(self.wf_ex, cmds)
        else:
            self.check_and_complete()

    @profiler.trace('workflow-lock')
    def lock(self):
        assert self.wf_ex

        # Acquire a DB lock on the workflow execution row so that
        # concurrent transactions working on the same execution are
        # serialized.
        return db_api.acquire_lock(db_models.WorkflowExecution, self.wf_ex.id)

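    # Typical call pattern (a sketch, not lifted from the engine code):
    #
    #   with db_api.transaction():
    #       wf = Workflow(wf_ex=wf_ex)
    #       wf.lock()
    #       # ... safely mutate workflow state here ...
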
    def _get_final_context(self):
        final_ctx = {}

        wf_ctrl = wf_base.get_controller(self.wf_ex)

        try:
            final_ctx = wf_ctrl.evaluate_workflow_final_context()
        except Exception as e:
            LOG.warning(
                'Failed to get final context for workflow execution. '
                '[wf_ex_id: %s, wf_name: %s, error: %s]',
                self.wf_ex.id,
                self.wf_ex.name,
                str(e)
            )

        return final_ctx

    def _create_execution(self, wf_def, wf_ex_id, input_dict, desc, params):
        self.wf_ex = db_api.create_workflow_execution({
            'id': wf_ex_id,
            'name': wf_def.name,
            'description': desc,
            'workflow_name': wf_def.name,
            'workflow_namespace': wf_def.namespace,
            'workflow_id': wf_def.id,
            'spec': self.wf_spec.to_dict(),
            'state': states.IDLE,
            'output': {},
            'task_execution_id': params.get('task_execution_id'),
            'root_execution_id': params.get('root_execution_id'),
            'runtime_context': {
                'index': params.get('index', 0)
            },
        })

        self.wf_ex.input = input_dict or {}

        params['env'] = _get_environment(params)

        self.wf_ex.params = params

        data_flow.add_openstack_data_to_context(self.wf_ex)
        data_flow.add_execution_to_context(self.wf_ex)
        data_flow.add_workflow_variables_to_context(self.wf_ex, self.wf_spec)

        spec_parser.cache_workflow_spec_by_execution_id(
            self.wf_ex.id,
            self.wf_spec
        )

    @profiler.trace('workflow-set-state')
    def set_state(self, state, state_info=None):
        assert self.wf_ex

        cur_state = self.wf_ex.state

        if states.is_valid_transition(cur_state, state):
            wf_ex = db_api.update_workflow_execution_state(
                id=self.wf_ex.id,
                cur_state=cur_state,
                state=state
            )

            if wf_ex is None:
                # Do nothing because the state was updated previously.
                return False

            self.wf_ex = wf_ex
            self.wf_ex.state_info = json.dumps(state_info) \
                if isinstance(state_info, dict) else state_info

            wf_trace.info(
                self.wf_ex,
                "Workflow '%s' [%s -> %s, msg=%s]" %
                (self.wf_ex.workflow_name,
                 cur_state,
                 state,
                 self.wf_ex.state_info)
            )
        else:
            msg = ("Can't change workflow execution state from %s to %s. "
                   "[workflow=%s, execution_id=%s]" %
                   (cur_state, state, self.wf_ex.name, self.wf_ex.id))

            raise exc.WorkflowException(msg)

        # Workflow result should be accepted by parent workflows (if any)
        # only if it completed successfully or failed.
        self.wf_ex.accepted = states.is_completed(state)

        if states.is_completed(state):
            # No need to keep task executions of this workflow in the
            # lookup cache anymore.
            lookup_utils.invalidate_cached_task_executions(self.wf_ex.id)

            triggers.on_workflow_complete(self.wf_ex)

        return True

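    # For instance, RUNNING -> SUCCESS is a valid transition while
    # SUCCESS -> RUNNING is not: in the latter case set_state() raises
    # WorkflowException (see mistral.workflow.states.is_valid_transition).
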
    @profiler.trace('workflow-check-and-complete')
    def check_and_complete(self):
        """Completes the workflow if it needs to be completed.

        The method simply checks if there are any tasks that are not
        in a terminal state. If there aren't any, it performs all
        necessary logic to finalize the workflow (calculate output etc.).

        :return: Number of incomplete tasks.
        """

        if states.is_paused_or_completed(self.wf_ex.state):
            return 0

        # Workflow is not completed if there are any incomplete task
        # executions.
        incomplete_tasks_count = db_api.get_incomplete_task_executions_count(
            workflow_execution_id=self.wf_ex.id,
        )

        if incomplete_tasks_count > 0:
            return incomplete_tasks_count

        LOG.debug("Workflow completed [id=%s]", self.wf_ex.id)

        # NOTE(rakhmerov): Once we know that the workflow has completed,
        # we need to expire all the objects in the DB session to make sure
        # to read the most relevant data from the DB (that's already been
        # committed in parallel transactions). Otherwise, some data like
        # workflow context may be stale and decisions made upon it will be
        # wrong.
        db_api.expire_all()

        wf_ctrl = wf_base.get_controller(self.wf_ex, self.wf_spec)

        if wf_ctrl.any_cancels():
            msg = _build_cancel_info_message(wf_ctrl, self.wf_ex)

            self._cancel_workflow(msg)
        elif wf_ctrl.all_errors_handled():
            ctx = wf_ctrl.evaluate_workflow_final_context()

            self._succeed_workflow(ctx)
        else:
            msg = _build_fail_info_message(wf_ctrl, self.wf_ex)
            final_context = wf_ctrl.evaluate_workflow_final_context()

            self._fail_workflow(final_context, msg)

        return 0

    def _succeed_workflow(self, final_context, msg=None):
        output = data_flow.evaluate_workflow_output(
            self.wf_ex,
            self.wf_spec.get_output(),
            final_context
        )

        # Set workflow execution to success after output is evaluated.
        if not self.set_state(states.SUCCESS, msg):
            return

        self.wf_ex.output = output

        # Publish event.
        self.notify(events.WORKFLOW_SUCCEEDED)

        if self.wf_ex.task_execution_id:
            self._send_result_to_parent_workflow()

    def _fail_workflow(self, final_context, msg):
        if states.is_paused_or_completed(self.wf_ex.state):
            return

        output_on_error = {}

        try:
            output_on_error = data_flow.evaluate_workflow_output(
                self.wf_ex,
                self.wf_spec.get_output_on_error(),
                final_context
            )
        except exc.MistralException as e:
            msg = (
                "Failed to evaluate expression in output-on-error! "
                "(output-on-error: '%s', exception: '%s', cause: '%s')"
                % (self.wf_spec.get_output_on_error(), e, msg)
            )
            LOG.error(msg)

        if not self.set_state(states.ERROR, state_info=msg):
            return

        # When we set an ERROR state we should set the output value safely,
        # i.e. without exceptions caused by the field size limitation.
        length_output_on_error = len(str(output_on_error).encode("utf-8"))
        total_output_length = utils.get_number_of_chars_from_kilobytes(
            cfg.CONF.engine.execution_field_size_limit_kb)

        if length_output_on_error < total_output_length:
            msg = utils.cut_by_char(
                msg,
                total_output_length - length_output_on_error
            )
        else:
            msg = utils.cut_by_kb(
                msg,
                cfg.CONF.engine.execution_field_size_limit_kb
            )

        self.wf_ex.output = merge_dicts({'result': msg}, output_on_error)

        # Publish event.
        self.notify(events.WORKFLOW_FAILED)

        if self.wf_ex.task_execution_id:
            self._send_result_to_parent_workflow()

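    # A worked example of the truncation above (illustrative numbers):
    # with execution_field_size_limit_kb = 1 the output field may hold
    # get_number_of_chars_from_kilobytes(1) characters. If the evaluated
    # output-on-error already serializes to 300 of them, the error
    # message is cut so that both together stay within the field limit.
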
    def _cancel_workflow(self, msg):
        if states.is_completed(self.wf_ex.state):
            return

        if not self.set_state(states.CANCELLED, state_info=msg):
            return

        # When we set a CANCELLED state we should set the output value
        # safely, i.e. without exceptions caused by the field size
        # limitation.
        msg = utils.cut_by_kb(
            msg,
            cfg.CONF.engine.execution_field_size_limit_kb
        )

        self.wf_ex.output = {'result': msg}

        # Publish event.
        self.notify(events.WORKFLOW_CANCELLED)

        if self.wf_ex.task_execution_id:
            self._send_result_to_parent_workflow()

    def _send_result_to_parent_workflow(self):
        if self.wf_ex.state == states.SUCCESS:
            # The result of the sub workflow is already saved
            # so there's no need to send it over RPC.
            result = None
        elif self.wf_ex.state == states.ERROR:
            err_msg = (
                self.wf_ex.state_info or
                'Failed subworkflow [execution_id=%s]' % self.wf_ex.id
            )

            result = ml_actions.Result(error=err_msg)
        elif self.wf_ex.state == states.CANCELLED:
            err_msg = (
                self.wf_ex.state_info or
                'Cancelled subworkflow [execution_id=%s]' % self.wf_ex.id
            )

            result = ml_actions.Result(error=err_msg, cancel=True)
        else:
            raise RuntimeError(
                "Method _send_result_to_parent_workflow() must never be "
                "called if a workflow is not in SUCCESS, ERROR or CANCELLED "
                "state."
            )

        # Register a command executed in a separate thread to send the result
        # to the parent workflow outside of the main DB transaction.
        def _send_result():
            rpc.get_engine_client().on_action_complete(
                self.wf_ex.id,
                result,
                wf_action=True
            )

        post_tx_queue.register_operation(_send_result)


def _get_environment(params):
    env = params.get('env', {})

    if not env:
        return {}

    if isinstance(env, dict):
        env_dict = env
    elif isinstance(env, six.string_types):
        # The environment is given by name and must be loaded from the DB.
        env_db = db_api.load_environment(env)

        if not env_db:
            raise exc.InputException(
                'Environment is not found: %s' % env
            )

        env_dict = env_db.variables
    else:
        raise exc.InputException(
            'Unexpected value type for environment [env=%s, type=%s]'
            % (env, type(env))
        )

    if 'evaluate_env' in params and not params['evaluate_env']:
        return env_dict
    else:
        return expr.evaluate_recursively(env_dict, {'__env': env_dict})


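# Examples of accepted 'env' values (illustrative; 'my_env' is a
# hypothetical environment name stored via the environments API):
#
#   params = {'env': {'key1': 'abc'}}   # an inline dict of variables
#   params = {'env': 'my_env'}          # the name of a stored environment
#
# Unless 'evaluate_env' is explicitly set to False, the variables are
# evaluated recursively, so values may reference each other through
# expressions such as <% env().key1 %>.
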
def _build_fail_info_message(wf_ctrl, wf_ex):
    # Try to find where exactly the error is.
    failed_tasks = sorted(
        filter(
            lambda t: not wf_ctrl.is_error_handled_for(t),
            lookup_utils.find_error_task_executions(wf_ex.id)
        ),
        key=lambda t: t.name
    )

    msg = ('Failure caused by error in tasks: %s\n' %
           ', '.join([t.name for t in failed_tasks]))

    for t in failed_tasks:
        msg += '\n %s [task_ex_id=%s] -> %s\n' % (t.name, t.id, t.state_info)

        for i, ex in enumerate(t.action_executions):
            if ex.state == states.ERROR:
                output = (ex.output or dict()).get('result', 'Unknown')

                msg += (
                    ' [action_ex_id=%s, idx=%s]: %s\n' % (
                        ex.id,
                        i,
                        str(output)
                    )
                )

        for i, ex in enumerate(t.workflow_executions):
            if ex.state == states.ERROR:
                output = (ex.output or dict()).get('result', 'Unknown')

                msg += (
                    ' [wf_ex_id=%s, idx=%s]: %s\n' % (
                        ex.id,
                        i,
                        str(output)
                    )
                )

    return msg


def _build_cancel_info_message(wf_ctrl, wf_ex):
    # Try to find where exactly the cancellation happened.
    cancelled_tasks = sorted(
        lookup_utils.find_cancelled_task_executions(wf_ex.id),
        key=lambda t: t.name
    )

    return (
        'Cancelled tasks: %s' % ', '.join([t.name for t in cancelled_tasks])
    )