Redesigning engine to move all remote calls out of DB transactions

* Fixes engine race condition between start_workflow and on_task_result methods
* Engine commands now have local and remote parts (in fact, "in tx" and "non tx")

Closes-Bug: #1395679
Change-Id: Icd4aa1a546893b815c01bea23880cde139df2d1b
This commit is contained in:
Renat Akhmerov 2014-11-26 15:43:41 +06:00
parent ec47d0f269
commit 9da521b841
9 changed files with 364 additions and 35 deletions

View File

@ -44,9 +44,13 @@ def _log_execution_state_change(name, from_state, to_state):
class EngineCommand(object): class EngineCommand(object):
"""Engine command interface.""" """Engine command interface."""
@abc.abstractmethod def run_local(self, exec_db, wf_handler, cause_task_db=None):
def run(self, exec_db, wf_handler, cause_task_db=None): """Runs local part of the command.
"""Runs the command.
"Local" means that the code can be performed within a scope
of an opened DB transaction. For example, for all commands
that simply change a state of execution (e.g. depending on
some conditions) it's enough to implement only this method.
:param exec_db: Workflow execution DB object. :param exec_db: Workflow execution DB object.
:param wf_handler: Workflow handler currently being used. :param wf_handler: Workflow handler currently being used.
@ -54,12 +58,32 @@ class EngineCommand(object):
:return False if engine should stop further command processing, :return False if engine should stop further command processing,
True otherwise. True otherwise.
""" """
raise NotImplementedError return True
def run_remote(self, exec_db, wf_handler, cause_task_db=None):
"""Runs remote part of the command.
"Remote" means that the code cannot be performed within a scope
of an opened DB transaction. All commands that deal with remote
invocations should implement this method. However, they may also
need to implement "run_local" if they need to do something with
DB state of execution and/or tasks.
:param exec_db: Workflow execution DB object.
:param wf_handler: Workflow handler currently being used.
:param cause_task_db: Task that caused the command to run.
:return False if engine should stop further command processing,
True otherwise.
"""
return True
class Noop(EngineCommand): class Noop(EngineCommand):
"""No-op command.""" """No-op command."""
def run(self, exec_db, wf_handler, cause_task_db=None): def run_local(self, exec_db, wf_handler, cause_task_db=None):
pass
def run_remote(self, exec_db, wf_handler, cause_task_db=None):
pass pass
@ -68,12 +92,14 @@ class RunTask(EngineCommand):
self.task_spec = task_spec self.task_spec = task_spec
self.task_db = task_db self.task_db = task_db
def run(self, exec_db, wf_handler, cause_task_db=None): if task_db:
self.exec_db = task_db.execution
def run_local(self, exec_db, wf_handler, cause_task_db=None):
LOG.debug('Running workflow task: %s' % self.task_spec) LOG.debug('Running workflow task: %s' % self.task_spec)
self._prepare_task(exec_db, wf_handler, cause_task_db) self._prepare_task(exec_db, wf_handler, cause_task_db)
self._before_task_start(wf_handler.wf_spec) self._before_task_start(wf_handler.wf_spec)
self._run_task()
return True return True
@ -82,6 +108,7 @@ class RunTask(EngineCommand):
return return
self.task_db = self._create_db_task(exec_db) self.task_db = self._create_db_task(exec_db)
self.exec_db = self.task_db.execution
# Evaluate Data Flow properties ('input', 'in_context'). # Evaluate Data Flow properties ('input', 'in_context').
data_flow.prepare_db_task( data_flow.prepare_db_task(
@ -111,6 +138,11 @@ class RunTask(EngineCommand):
'project_id': exec_db.project_id 'project_id': exec_db.project_id
}) })
def run_remote(self, exec_db, wf_handler, cause_task_db=None):
self._run_task()
return True
def _run_task(self): def _run_task(self):
# Policies could possibly change task state. # Policies could possibly change task state.
if self.task_db.state != states.RUNNING: if self.task_db.state != states.RUNNING:
@ -130,7 +162,7 @@ class RunTask(EngineCommand):
self._run_workflow() self._run_workflow()
def _run_action(self): def _run_action(self):
exec_db = self.task_db.execution exec_db = self.exec_db
wf_spec = spec_parser.get_workflow_spec(exec_db.wf_spec) wf_spec = spec_parser.get_workflow_spec(exec_db.wf_spec)
action_spec_name = self.task_spec.get_action_name() action_spec_name = self.task_spec.get_action_name()
@ -178,8 +210,10 @@ class RunTask(EngineCommand):
action_input.update(a_m.get_action_context(self.task_db)) action_input.update(a_m.get_action_context(self.task_db))
for_each = self.task_spec.get_for_each() for_each = self.task_spec.get_for_each()
if for_each: if for_each:
action_input_collection = self._calc_for_each_input(action_input) action_input_collection = self._calc_for_each_input(action_input)
for a_input in action_input_collection: for a_input in action_input_collection:
rpc.get_executor_client().run_action( rpc.get_executor_client().run_action(
self.task_db.id, self.task_db.id,
@ -204,11 +238,13 @@ class RunTask(EngineCommand):
targets targets
) )
def _calc_for_each_input(self, action_input): @staticmethod
def _calc_for_each_input(action_input):
# In case of for-each iterate over action_input and send # In case of for-each iterate over action_input and send
# each part of data to executor. # each part of data to executor.
# Calculate action input collection for separating input. # Calculate action input collection for separating input.
action_input_collection = [] action_input_collection = []
for key, value in action_input.items(): for key, value in action_input.items():
for index, item in enumerate(value): for index, item in enumerate(value):
iter_context = {key: item} iter_context = {key: item}
@ -221,7 +257,7 @@ class RunTask(EngineCommand):
return action_input_collection return action_input_collection
def _run_workflow(self): def _run_workflow(self):
parent_exec_db = self.task_db.execution parent_exec_db = self.exec_db
parent_wf_spec = spec_parser.get_workflow_spec(parent_exec_db.wf_spec) parent_wf_spec = spec_parser.get_workflow_spec(parent_exec_db.wf_spec)
wf_spec_name = self.task_spec.get_workflow_name() wf_spec_name = self.task_spec.get_workflow_name()
@ -251,7 +287,7 @@ class RunTask(EngineCommand):
class FailWorkflow(EngineCommand): class FailWorkflow(EngineCommand):
def run(self, exec_db, wf_handler, cause_task_db=None): def run_local(self, exec_db, wf_handler, cause_task_db=None):
_log_execution_state_change( _log_execution_state_change(
exec_db.wf_name, exec_db.wf_name,
exec_db.state, exec_db.state,
@ -262,9 +298,12 @@ class FailWorkflow(EngineCommand):
return False return False
def run_remote(self, exec_db, wf_handler, cause_task_db=None):
return False
class SucceedWorkflow(EngineCommand): class SucceedWorkflow(EngineCommand):
def run(self, exec_db, wf_handler, cause_task_db=None): def run_local(self, exec_db, wf_handler, cause_task_db=None):
_log_execution_state_change( _log_execution_state_change(
exec_db.wf_name, exec_db.wf_name,
exec_db.state, exec_db.state,
@ -275,18 +314,32 @@ class SucceedWorkflow(EngineCommand):
return False return False
def run_remote(self, exec_db, wf_handler, cause_task_db=None):
return False
class PauseWorkflow(EngineCommand): class PauseWorkflow(EngineCommand):
def run(self, exec_db, wf_handler, cause_task_db=None): def run_local(self, exec_db, wf_handler, cause_task_db=None):
_log_execution_state_change(
exec_db.wf_name,
exec_db.state,
states.PAUSED
)
wf_handler.pause_workflow() wf_handler.pause_workflow()
return False return False
def run_remote(self, exec_db, wf_handler, cause_task_db=None):
return False
class RollbackWorkflow(EngineCommand): class RollbackWorkflow(EngineCommand):
def run(self, exec_db, wf_handler, cause_task_db=None): def run_local(self, exec_db, wf_handler, cause_task_db=None):
pass return True
def run_remote(self, exec_db, wf_handler, cause_task_db=None):
return True
RESERVED_COMMANDS = { RESERVED_COMMANDS = {

View File

@ -69,7 +69,9 @@ class DefaultEngine(base.Engine):
# Calculate commands to process next. # Calculate commands to process next.
cmds = wf_handler.start_workflow(**params) cmds = wf_handler.start_workflow(**params)
self._run_commands(cmds, exec_db, wf_handler) self._run_local_commands(cmds, exec_db, wf_handler)
self._run_remote_commands(cmds, exec_db, wf_handler)
return exec_db return exec_db
@ -95,9 +97,16 @@ class DefaultEngine(base.Engine):
# Calculate commands to process next. # Calculate commands to process next.
cmds = wf_handler.on_task_result(task_db, raw_result) cmds = wf_handler.on_task_result(task_db, raw_result)
self._run_commands(cmds, exec_db, wf_handler, task_db) self._run_local_commands(
cmds,
exec_db,
wf_handler,
task_db
)
self._check_subworkflow_completion(exec_db) self._run_remote_commands(cmds, exec_db, wf_handler)
self._check_subworkflow_completion(exec_db)
return task_db return task_db
@ -122,7 +131,9 @@ class DefaultEngine(base.Engine):
# Calculate commands to process next. # Calculate commands to process next.
cmds = wf_handler.resume_workflow() cmds = wf_handler.resume_workflow()
self._run_commands(cmds, exec_db, wf_handler) self._run_local_commands(cmds, exec_db, wf_handler)
self._run_remote_commands(cmds, exec_db, wf_handler)
return exec_db return exec_db
@ -131,13 +142,26 @@ class DefaultEngine(base.Engine):
raise NotImplementedError raise NotImplementedError
@staticmethod @staticmethod
def _run_commands(cmds, exec_db, wf_handler, cause_task_db=None): def _run_local_commands(cmds, exec_db, wf_handler, cause_task_db=None):
if not cmds: if not cmds:
return return
for cmd in cmds: for cmd in cmds:
if not cmd.run(exec_db, wf_handler, cause_task_db): if not cmd.run_local(exec_db, wf_handler, cause_task_db):
break return False
return True
@staticmethod
def _run_remote_commands(cmds, exec_db, wf_handler, cause_task_db=None):
if not cmds:
return
for cmd in cmds:
if not cmd.run_remote(exec_db, wf_handler, cause_task_db):
return False
return True
@staticmethod @staticmethod
def _create_db_execution(wf_db, wf_spec, wf_input, params): def _create_db_execution(wf_db, wf_spec, wf_input, params):
@ -180,7 +204,11 @@ class DefaultEngine(base.Engine):
wf_handler = wfh_factory.create_workflow_handler(exec_db) wf_handler = wfh_factory.create_workflow_handler(exec_db)
commands.RunTask(task_spec, task_db).run(exec_db, wf_handler) cmd = commands.RunTask(task_spec, task_db)\
cmd.run_local(exec_db, wf_handler)
cmd.run_remote(exec_db, wf_handler)
def _check_subworkflow_completion(self, exec_db): def _check_subworkflow_completion(self, exec_db):
if not exec_db.parent_task_id: if not exec_db.parent_task_id:

View File

@ -28,6 +28,8 @@ from mistral import utils
from mistral.utils import inspect_utils as i_utils from mistral.utils import inspect_utils as i_utils
# TODO(rakhmerov): Make methods more consistent and granular.
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
ACTIONS_PATH = '../resources/actions' ACTIONS_PATH = '../resources/actions'
@ -48,11 +50,11 @@ def get_registered_actions(**kwargs):
return db_api.get_actions(**kwargs) return db_api.get_actions(**kwargs)
def _register_action_in_db(name, action_class, attributes, def register_action_class(name, action_class_str, attributes,
description=None, input_str=None): description=None, input_str=None):
values = { values = {
'name': name, 'name': name,
'action_class': action_class, 'action_class': action_class_str,
'attributes': attributes, 'attributes': attributes,
'description': description, 'description': description,
'input': input_str, 'input': input_str,
@ -74,6 +76,7 @@ def _clear_system_action_db():
def sync_db(): def sync_db():
_clear_system_action_db() _clear_system_action_db()
register_action_classes() register_action_classes()
register_standard_actions() register_standard_actions()
@ -90,7 +93,7 @@ def _register_dynamic_action_classes():
for action in actions: for action in actions:
attrs = i_utils.get_public_fields(action['class']) attrs = i_utils.get_public_fields(action['class'])
_register_action_in_db( register_action_class(
action['name'], action['name'],
action_class_str, action_class_str,
attrs, attrs,
@ -114,9 +117,13 @@ def register_action_classes():
attrs = i_utils.get_public_fields(mgr[name].plugin) attrs = i_utils.get_public_fields(mgr[name].plugin)
_register_action_in_db(name, action_class_str, attrs, register_action_class(
description=description, name,
input_str=input_str) action_class_str,
attrs,
description=description,
input_str=input_str
)
_register_dynamic_action_classes() _register_dynamic_action_classes()
@ -134,8 +141,10 @@ def get_action_class(action_full_name):
action_db = get_action_db(action_full_name) action_db = get_action_db(action_full_name)
if action_db: if action_db:
return action_factory.construct_action_class(action_db.action_class, return action_factory.construct_action_class(
action_db.attributes) action_db.action_class,
action_db.attributes
)
def get_action_context(task_db): def get_action_context(task_db):

View File

@ -21,6 +21,7 @@ from mistral.services import workbooks as wb_service
from mistral.tests.unit.engine1 import base from mistral.tests.unit.engine1 import base
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
# Use the set_default method to set value otherwise in certain test cases # Use the set_default method to set value otherwise in certain test cases
# the change in value is not permanent. # the change in value is not permanent.
cfg.CONF.set_default('auth_enable', False, group='pecan') cfg.CONF.set_default('auth_enable', False, group='pecan')
@ -28,7 +29,9 @@ cfg.CONF.set_default('auth_enable', False, group='pecan')
WORKBOOK = """ WORKBOOK = """
--- ---
version: '2.0' version: '2.0'
name: wb name: wb
workflows: workflows:
wf1: wf1:
type: direct type: direct
@ -72,10 +75,33 @@ class DataFlowEngineTest(base.EngineTestCase):
self.assertEqual(states.SUCCESS, exec_db.state) self.assertEqual(states.SUCCESS, exec_db.state)
tasks = exec_db.tasks tasks = exec_db.tasks
task1 = self._assert_single_item(tasks, name='task1')
task2 = self._assert_single_item(tasks, name='task2')
task3 = self._assert_single_item(tasks, name='task3') task3 = self._assert_single_item(tasks, name='task3')
self.assertEqual(states.SUCCESS, task3.state) self.assertEqual(states.SUCCESS, task3.state)
self.assertDictEqual(
{
'task': {
'task1': {'hi': 'Hi,'},
},
'hi': 'Hi,',
},
task1.output
)
self.assertDictEqual(
{
'task': {
'task2': {'username': 'Morpheus'},
},
'username': 'Morpheus',
},
task2.output
)
self.assertDictEqual( self.assertDictEqual(
{ {
'task': { 'task': {

View File

@ -22,6 +22,7 @@ from mistral.tests.unit.engine1 import base
# TODO(nmakhotkin) Need to write more tests. # TODO(nmakhotkin) Need to write more tests.
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
# Use the set_default method to set value otherwise in certain test cases # Use the set_default method to set value otherwise in certain test cases
# the change in value is not permanent. # the change in value is not permanent.
cfg.CONF.set_default('auth_enable', False, group='pecan') cfg.CONF.set_default('auth_enable', False, group='pecan')
@ -89,6 +90,7 @@ class DirectWorkflowEngineTest(base.EngineTestCase):
exec_db = db_api.get_execution(exec_db.id) exec_db = db_api.get_execution(exec_db.id)
tasks = exec_db.tasks tasks = exec_db.tasks
task3 = self._assert_single_item(tasks, name='task3') task3 = self._assert_single_item(tasks, name='task3')
task4 = self._assert_single_item(tasks, name='task4') task4 = self._assert_single_item(tasks, name='task4')

View File

@ -0,0 +1,208 @@
# Copyright 2014 - Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from eventlet import corolocal
from eventlet import semaphore
from oslo.config import cfg
from mistral.actions import base as action_base
from mistral.db.v2 import api as db_api
from mistral.openstack.common import log as logging
from mistral.services import action_manager as a_m
from mistral.services import workflows as wf_service
from mistral.tests.unit.engine1 import base
from mistral.workflow import states
LOG = logging.getLogger(__name__)
# Use the set_default method to set value otherwise in certain test cases
# the change in value is not permanent.
cfg.CONF.set_default('auth_enable', False, group='pecan')
# NOTE(review): indentation inside these YAML workflow definitions was lost
# in the rendered source; it is reconstructed here following the Mistral v2
# workflow DSL layout — confirm against the original file.
WF_LONG_ACTION = """
---
version: '2.0'

wf:
  type: direct

  description: |
    The idea is to use action that runs longer than engine.start_workflow()
    method. And we need to check that engine handles this situation.

  output:
    result: $.result

  tasks:
    task1:
      action: std.block
      publish:
        result: $
"""

WF_SHORT_ACTION = """
---
version: '2.0'

wf:
  type: direct

  description: |
    The idea is to use action that runs faster than engine.start_workflow().
    And we need to check that engine handles this situation as well. This was
    a situation previously that led to a race condition in engine, method
    on_task_result() was called while DB transaction in start_workflow() was
    still active (not committed yet).

    To emulate a short action we use a workflow with two start tasks so they
    run both in parallel on the first engine iteration when we call method
    start_workflow(). First task has a short action that just returns a
    predefined result and the second task blocks until the test explicitly
    unblocks it. So the first action will always end before start_workflow()
    methods ends.

  output:
    result: $.result

  tasks:
    task1:
      action: std.echo output=1
      publish:
        result1: $

    task2:
      action: std.block
"""

# Module-level semaphores used to synchronize the blocking action with the
# test; (re)initialized in LongActionTest.setUp() for every test run.
ACTION_SEMAPHORE = None
TEST_SEMAPHORE = None
class BlockingAction(action_base.Action):
    """Action that blocks until the test explicitly unblocks it.

    Synchronization with the test code is done via the two module-level
    eventlet semaphores (ACTION_SEMAPHORE / TEST_SEMAPHORE), which must be
    initialized by the test's setUp() before the action runs.
    """

    def __init__(self):
        pass

    @staticmethod
    def unblock_test():
        # Let the waiting test proceed: the action has started.
        TEST_SEMAPHORE.release()

    @staticmethod
    def wait_for_test():
        # Block here until the test releases ACTION_SEMAPHORE.
        ACTION_SEMAPHORE.acquire()

    def run(self):
        self.unblock_test()
        self.wait_for_test()

        print('Action completed [eventlet_id=%s]' % corolocal.get_ident())

        return 'test'

    def test(self):
        pass
class LongActionTest(base.EngineTestCase):
    """Checks engine behavior with actions that outrun or outpace it.

    Covers both directions of the race between engine transactions and
    remote action execution: an action still running when
    start_workflow() returns (long action) and an action finishing
    before the start_workflow() transaction commits (short action).
    """

    def setUp(self):
        super(LongActionTest, self).setUp()

        global ACTION_SEMAPHORE
        global TEST_SEMAPHORE

        # Action may proceed immediately unless the test blocks it first.
        ACTION_SEMAPHORE = semaphore.Semaphore(1)

        # Test waits until the action signals that it has started.
        TEST_SEMAPHORE = semaphore.Semaphore(0)

        a_m.register_action_class(
            'std.block',
            '%s.%s' % (BlockingAction.__module__, BlockingAction.__name__),
            None
        )

    @staticmethod
    def block_action():
        ACTION_SEMAPHORE.acquire()

    @staticmethod
    def unblock_action():
        ACTION_SEMAPHORE.release()

    @staticmethod
    def wait_for_action():
        TEST_SEMAPHORE.acquire()

    def test_long_action(self):
        wf_service.create_workflows(WF_LONG_ACTION)

        self.block_action()

        exec_db = self.engine.start_workflow('wf', None)

        exec_db = db_api.get_execution(exec_db.id)

        self.assertEqual(states.RUNNING, exec_db.state)
        self.assertEqual(states.RUNNING, exec_db.tasks[0].state)

        self.wait_for_action()

        # Here's the point when the action is blocked but already running.
        # Do the same check again, it should always pass.
        exec_db = db_api.get_execution(exec_db.id)

        self.assertEqual(states.RUNNING, exec_db.state)
        self.assertEqual(states.RUNNING, exec_db.tasks[0].state)

        self.unblock_action()

        self._await(lambda: self.is_execution_success(exec_db.id))

        exec_db = db_api.get_execution(exec_db.id)

        self.assertDictEqual({'result': 'test'}, exec_db.output)

    def test_short_action(self):
        wf_service.create_workflows(WF_SHORT_ACTION)

        self.block_action()

        exec_db = self.engine.start_workflow('wf', None)

        exec_db = db_api.get_execution(exec_db.id)

        self.assertEqual(states.RUNNING, exec_db.state)

        tasks = exec_db.tasks

        task1 = self._assert_single_item(tasks, name='task1')
        task2 = self._assert_single_item(
            tasks,
            name='task2',
            state=states.RUNNING
        )

        self._await(lambda: self.is_task_success(task1.id))

        self.unblock_action()

        self._await(lambda: self.is_task_success(task2.id))
        self._await(lambda: self.is_execution_success(exec_db.id))

        task1 = db_api.get_task(task1.id)

        self.assertDictEqual(
            {
                'result1': 1,
                'task': {'task1': {'result1': 1}}
            },
            task1.output
        )

View File

@ -22,6 +22,7 @@ def get_public_fields(obj):
if not attr.startswith("_")] if not attr.startswith("_")]
public_fields = {} public_fields = {}
for attribute_str in public_attributes: for attribute_str in public_attributes:
attr = getattr(obj, attribute_str) attr = getattr(obj, attribute_str)
is_field = not (inspect.isbuiltin(attr) is_field = not (inspect.isbuiltin(attr)

View File

@ -129,7 +129,8 @@ class WorkflowHandler(object):
return cmds return cmds
def _determine_task_output(self, task_spec, task_db, raw_result): @staticmethod
def _determine_task_output(task_spec, task_db, raw_result):
for_each = task_spec.get_for_each() for_each = task_spec.get_for_each()
t_name = task_spec.get_name() t_name = task_spec.get_name()
@ -172,7 +173,8 @@ class WorkflowHandler(object):
else: else:
return data_flow.evaluate_task_output(task_spec, raw_result) return data_flow.evaluate_task_output(task_spec, raw_result)
def _determine_task_state(self, task_db, task_spec, raw_result): @staticmethod
def _determine_task_state(task_db, task_spec, raw_result):
state = states.ERROR if raw_result.is_error() else states.SUCCESS state = states.ERROR if raw_result.is_error() else states.SUCCESS
for_each = task_spec.get_for_each() for_each = task_spec.get_for_each()

View File

@ -88,7 +88,7 @@ class DirectWorkflowHandler(base.WorkflowHandler):
Expression 'on_complete' is not mutually exclusive to 'on_success' Expression 'on_complete' is not mutually exclusive to 'on_success'
and 'on_error'. and 'on_error'.
:param task_db: Task DB model. :param task_db: Task DB model.
:param remove_incomplete_joins: True if incomplete "join" :param remove_unsatisfied_joins: True if incomplete "join"
tasks must be excluded from the list of commands. tasks must be excluded from the list of commands.
:return: List of task specifications. :return: List of task specifications.
""" """