Catch workflow errors
Avoid keeping workflow in zombie state: on unexpected errors fail the workflow execution and save error information. If possible, fail the corresponding tasks, too. Done: - [x] Introduce state_info into Execution, fill it with error info when workflow fails. - [x] Add error handlers to `on_task_result` and `run_task` - [x] Add error handlers and tests to `start_workflow` - [x] Add tests for YAQL eval exceptions - [x] Add error handler and tests to `resume_workflow` TODO: - [ ] Make _fail_workflow public and let user fail workflow via API (on separate commit) Closes-bug: 1415886 Change-Id: Ib1fd661580d3a426dbbc7401a3622a98c7840e00
This commit is contained in:
parent
9300408fb1
commit
6a2481c6c6
@ -47,6 +47,9 @@ class Execution(resource.Resource):
|
||||
state = wtypes.text
|
||||
"state can be one of: RUNNING, SUCCESS, ERROR, PAUSED"
|
||||
|
||||
state_info = wtypes.text
|
||||
"an optional state information string"
|
||||
|
||||
input = wtypes.text
|
||||
"input is a JSON structure containing workflow input values."
|
||||
output = wtypes.text
|
||||
|
@ -64,6 +64,7 @@ class Execution(mb.MistralSecureModelBase):
|
||||
wf_spec = sa.Column(st.JsonDictType())
|
||||
start_params = sa.Column(st.JsonDictType())
|
||||
state = sa.Column(sa.String(20))
|
||||
state_info = sa.Column(sa.Text(), nullable=True)
|
||||
input = sa.Column(st.JsonDictType())
|
||||
output = sa.Column(st.JsonDictType())
|
||||
context = sa.Column(st.JsonDictType())
|
||||
|
@ -15,6 +15,7 @@
|
||||
|
||||
import copy
|
||||
from oslo.config import cfg
|
||||
import traceback
|
||||
|
||||
from mistral.db.v2 import api as db_api
|
||||
from mistral.engine1 import base
|
||||
@ -49,7 +50,8 @@ class DefaultEngine(base.Engine):
|
||||
"Starting the execution of workflow '%s'"
|
||||
% workflow_name
|
||||
)
|
||||
|
||||
try:
|
||||
execution_id = None
|
||||
params = self._canonize_workflow_params(params)
|
||||
|
||||
with db_api.transaction():
|
||||
@ -65,8 +67,12 @@ class DefaultEngine(base.Engine):
|
||||
workflow_input,
|
||||
params
|
||||
)
|
||||
execution_id = exec_db.id
|
||||
|
||||
wf_handler = wfh_factory.create_workflow_handler(exec_db, wf_spec)
|
||||
wf_handler = wfh_factory.create_workflow_handler(
|
||||
exec_db,
|
||||
wf_spec
|
||||
)
|
||||
|
||||
# Calculate commands to process next.
|
||||
cmds = wf_handler.start_workflow(**params)
|
||||
@ -75,15 +81,28 @@ class DefaultEngine(base.Engine):
|
||||
|
||||
self._run_remote_commands(cmds, exec_db, wf_handler)
|
||||
|
||||
except Exception as e:
|
||||
LOG.error("Failed to start workflow '%s' id=%s: %s\n%s",
|
||||
workflow_name, execution_id, e, traceback.format_exc())
|
||||
self._fail_workflow(execution_id, e)
|
||||
raise e
|
||||
|
||||
return exec_db
|
||||
|
||||
@u.log_exec(LOG)
|
||||
def on_task_result(self, task_id, raw_result):
|
||||
try:
|
||||
task_name = "Unknown"
|
||||
execution_id = None
|
||||
|
||||
with db_api.transaction():
|
||||
task_db = db_api.get_task(task_id)
|
||||
task_name = task_db.name
|
||||
exec_db = db_api.get_execution(task_db.execution_id)
|
||||
execution_id = exec_db.id
|
||||
|
||||
raw_result = utils.transform_result(exec_db, task_db, raw_result)
|
||||
raw_result = utils.transform_result(
|
||||
exec_db, task_db, raw_result)
|
||||
wf_handler = wfh_factory.create_workflow_handler(exec_db)
|
||||
|
||||
self._after_task_complete(
|
||||
@ -107,11 +126,54 @@ class DefaultEngine(base.Engine):
|
||||
)
|
||||
|
||||
self._run_remote_commands(cmds, exec_db, wf_handler)
|
||||
|
||||
self._check_subworkflow_completion(exec_db)
|
||||
|
||||
except Exception as e:
|
||||
LOG.error("Failed to handle results for task '%s' id=%s: %s\n%s",
|
||||
task_name, task_id, e, traceback.format_exc())
|
||||
# TODO(dzimine): try to find out which command caused failure.
|
||||
self._fail_workflow(execution_id, e)
|
||||
raise e
|
||||
|
||||
return task_db
|
||||
|
||||
@u.log_exec(LOG)
|
||||
def run_task(self, task_id):
|
||||
try:
|
||||
execution_id = None
|
||||
|
||||
with db_api.transaction():
|
||||
task_db = db_api.get_task(task_id)
|
||||
|
||||
WF_TRACE.info(
|
||||
"Task '%s' [%s -> %s]"
|
||||
% (task_db.name, task_db.state, states.RUNNING)
|
||||
)
|
||||
|
||||
task_db = db_api.update_task(
|
||||
task_id,
|
||||
{'state': states.RUNNING}
|
||||
)
|
||||
|
||||
task_spec = spec_parser.get_task_spec(task_db.spec)
|
||||
|
||||
exec_db = task_db.execution
|
||||
execution_id = exec_db.id
|
||||
|
||||
wf_handler = wfh_factory.create_workflow_handler(exec_db)
|
||||
|
||||
cmd = commands.RunTask(task_spec, task_db)
|
||||
|
||||
cmd.run_local(exec_db, wf_handler)
|
||||
|
||||
cmd.run_remote(exec_db, wf_handler)
|
||||
|
||||
except Exception as e:
|
||||
LOG.error("Failed to run task '%s': %s\n%s",
|
||||
task_db.name, e, traceback.format_exc())
|
||||
self._fail_workflow(execution_id, e, task_id)
|
||||
raise e
|
||||
|
||||
@u.log_exec(LOG)
|
||||
def pause_workflow(self, execution_id):
|
||||
with db_api.transaction():
|
||||
@ -125,6 +187,7 @@ class DefaultEngine(base.Engine):
|
||||
|
||||
@u.log_exec(LOG)
|
||||
def resume_workflow(self, execution_id):
|
||||
try:
|
||||
with db_api.transaction():
|
||||
exec_db = db_api.get_execution(execution_id)
|
||||
|
||||
@ -137,6 +200,38 @@ class DefaultEngine(base.Engine):
|
||||
|
||||
self._run_remote_commands(cmds, exec_db, wf_handler)
|
||||
|
||||
except Exception as e:
|
||||
LOG.error("Failed to resume execution id=%s: %s\n%s",
|
||||
execution_id, e, traceback.format_exc())
|
||||
self._fail_workflow(execution_id, e)
|
||||
raise e
|
||||
|
||||
return exec_db
|
||||
|
||||
# TODO(dzimine): make public, add to RPC, expose in API
|
||||
def _fail_workflow(self, execution_id, err, task_id=None):
|
||||
with db_api.transaction():
|
||||
err_msg = str(err)
|
||||
|
||||
exec_db = db_api.load_execution(execution_id)
|
||||
if exec_db is None:
|
||||
LOG.error("Cant fail workflow execution id='%s': not found.",
|
||||
execution_id)
|
||||
return
|
||||
|
||||
wf_handler = wfh_factory.create_workflow_handler(exec_db)
|
||||
wf_handler.fail_workflow(err_msg)
|
||||
|
||||
if task_id:
|
||||
task_db = db_api.get_task(task_id)
|
||||
# Note(dzimine): Don't call self.engine_client:
|
||||
# 1) to avoid computing and triggering next tasks
|
||||
# 2) to avoid a loop in case of error in transport
|
||||
wf_handler.on_task_result(
|
||||
task_db,
|
||||
wf_utils.TaskResult(error=err_msg)
|
||||
)
|
||||
|
||||
return exec_db
|
||||
|
||||
def rollback_workflow(self, execution_id):
|
||||
@ -206,29 +301,6 @@ class DefaultEngine(base.Engine):
|
||||
for p in policies.build_policies(task_spec.get_policies(), wf_spec):
|
||||
p.after_task_complete(task_db, task_spec, raw_result)
|
||||
|
||||
@u.log_exec(LOG)
|
||||
def run_task(self, task_id):
|
||||
with db_api.transaction():
|
||||
task_db = db_api.get_task(task_id)
|
||||
|
||||
WF_TRACE.info(
|
||||
"Task '%s' [%s -> %s]"
|
||||
% (task_db.name, task_db.state, states.RUNNING)
|
||||
)
|
||||
|
||||
task_db = db_api.update_task(task_id, {'state': states.RUNNING})
|
||||
task_spec = spec_parser.get_task_spec(task_db.spec)
|
||||
|
||||
exec_db = task_db.execution
|
||||
|
||||
wf_handler = wfh_factory.create_workflow_handler(exec_db)
|
||||
|
||||
cmd = commands.RunTask(task_spec, task_db)\
|
||||
|
||||
cmd.run_local(exec_db, wf_handler)
|
||||
|
||||
cmd.run_remote(exec_db, wf_handler)
|
||||
|
||||
def _check_subworkflow_completion(self, exec_db):
|
||||
if not exec_db.parent_task_id:
|
||||
return
|
||||
|
@ -12,12 +12,17 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import mock
|
||||
from oslo.config import cfg
|
||||
from yaql import exceptions as yaql_exc
|
||||
|
||||
from mistral.db.v2 import api as db_api
|
||||
from mistral.engine1 import default_engine as de
|
||||
from mistral import exceptions as exc
|
||||
from mistral.openstack.common import log as logging
|
||||
from mistral.services import workbooks as wb_service
|
||||
from mistral.services import workflows as wf_service
|
||||
from mistral.tests.unit.engine1 import base
|
||||
from mistral.workflow import states
|
||||
|
||||
# TODO(nmakhotkin) Need to write more tests.
|
||||
|
||||
@ -27,14 +32,21 @@ LOG = logging.getLogger(__name__)
|
||||
# the change in value is not permanent.
|
||||
cfg.CONF.set_default('auth_enable', False, group='pecan')
|
||||
|
||||
WORKBOOK = """
|
||||
---
|
||||
|
||||
class DirectWorkflowEngineTest(base.EngineTestCase):
|
||||
|
||||
def _run_workflow(self, worklfow_yaml):
|
||||
wf_service.create_workflows(worklfow_yaml)
|
||||
exec_db = self.engine.start_workflow('wf', {})
|
||||
self._await(lambda: self.is_execution_error(exec_db.id))
|
||||
|
||||
return db_api.get_execution(exec_db.id)
|
||||
|
||||
def test_direct_workflow_on_closures(self):
|
||||
WORKFLOW = """
|
||||
version: '2.0'
|
||||
|
||||
name: wb
|
||||
|
||||
workflows:
|
||||
wf1:
|
||||
wf:
|
||||
type: direct
|
||||
|
||||
tasks:
|
||||
@ -48,6 +60,7 @@ workflows:
|
||||
- task3
|
||||
- task4
|
||||
- fail
|
||||
- never_gets_here
|
||||
|
||||
task2:
|
||||
action: std.echo output="Morpheus"
|
||||
@ -57,60 +70,171 @@ workflows:
|
||||
|
||||
task4:
|
||||
action: std.echo output="output"
|
||||
|
||||
never_gets_here:
|
||||
action: std.noop
|
||||
"""
|
||||
|
||||
|
||||
WORKBOOK_WRONG_TASK_INPUT = """
|
||||
---
|
||||
version: '2.0'
|
||||
|
||||
name: wb
|
||||
|
||||
workflows:
|
||||
wf:
|
||||
type: direct
|
||||
|
||||
tasks:
|
||||
task1:
|
||||
description: That should lead to workflow fail.
|
||||
action: std.echo outpu="Echo"
|
||||
"""
|
||||
|
||||
|
||||
class DirectWorkflowEngineTest(base.EngineTestCase):
|
||||
def test_direct_workflow_on_closures(self):
|
||||
wb_service.create_workbook_v2(WORKBOOK)
|
||||
|
||||
# Start workflow.
|
||||
exec_db = self.engine.start_workflow('wb.wf1', {})
|
||||
|
||||
self._await(lambda: self.is_execution_error(exec_db.id))
|
||||
|
||||
# Note: We need to reread execution to access related tasks.
|
||||
exec_db = db_api.get_execution(exec_db.id)
|
||||
exec_db = self._run_workflow(WORKFLOW)
|
||||
|
||||
tasks = exec_db.tasks
|
||||
|
||||
task1 = self._assert_single_item(tasks, name='task1')
|
||||
task3 = self._assert_single_item(tasks, name='task3')
|
||||
task4 = self._assert_single_item(tasks, name='task4')
|
||||
|
||||
self.assertEqual(3, len(tasks))
|
||||
|
||||
self._await(lambda: self.is_task_success(task1.id))
|
||||
self._await(lambda: self.is_task_success(task3.id))
|
||||
self._await(lambda: self.is_task_success(task4.id))
|
||||
|
||||
self.assertTrue(exec_db.state, states.ERROR)
|
||||
|
||||
def test_wrong_task_input(self):
|
||||
wb_service.create_workbook_v2(WORKBOOK_WRONG_TASK_INPUT)
|
||||
WORKFLOW_WRONG_TASK_INPUT = """
|
||||
version: '2.0'
|
||||
|
||||
# Start workflow.
|
||||
exec_db = self.engine.start_workflow('wb.wf', {})
|
||||
wf:
|
||||
type: direct
|
||||
|
||||
self._await(lambda: self.is_execution_error(exec_db.id))
|
||||
tasks:
|
||||
task1:
|
||||
action: std.echo output="Echo"
|
||||
on-complete:
|
||||
- task2
|
||||
task2:
|
||||
description: Wrong task output should lead to workflow failure
|
||||
action: std.echo wrong_input="Hahaha"
|
||||
"""
|
||||
exec_db = exec_db = self._run_workflow(WORKFLOW_WRONG_TASK_INPUT)
|
||||
task_db2 = exec_db.tasks[1]
|
||||
|
||||
exec_db = db_api.get_execution(exec_db.id)
|
||||
self.assertIn(
|
||||
"Failed to initialize action",
|
||||
task_db2.output['task'][task_db2.name]
|
||||
)
|
||||
self.assertIn(
|
||||
"unexpected keyword argument",
|
||||
task_db2.output['task'][task_db2.name]
|
||||
)
|
||||
|
||||
self.assertTrue(exec_db.state, states.ERROR)
|
||||
self.assertIn(task_db2.output['error'], exec_db.state_info)
|
||||
|
||||
def test_wrong_first_task_input(self):
|
||||
WORKFLOW_WRONG_FIRST_TASK_INPUT = """
|
||||
version: '2.0'
|
||||
|
||||
wf:
|
||||
type: direct
|
||||
|
||||
tasks:
|
||||
task1:
|
||||
action: std.echo wrong_input="Ha-ha"
|
||||
"""
|
||||
exec_db = self._run_workflow(WORKFLOW_WRONG_FIRST_TASK_INPUT)
|
||||
task_db = exec_db.tasks[0]
|
||||
|
||||
self.assertIn(
|
||||
"Failed to initialize action",
|
||||
task_db.output['task'][task_db.name]
|
||||
)
|
||||
self.assertIn(
|
||||
"unexpected keyword argument",
|
||||
task_db.output['task'][task_db.name]
|
||||
)
|
||||
|
||||
self.assertTrue(exec_db.state, states.ERROR)
|
||||
self.assertIn(task_db.output['error'], exec_db.state_info)
|
||||
|
||||
def test_wrong_action(self):
|
||||
WORKFLOW_WRONG_ACTION = """
|
||||
version: '2.0'
|
||||
|
||||
wf:
|
||||
type: direct
|
||||
tasks:
|
||||
task1:
|
||||
action: std.echo output="Echo"
|
||||
on-complete:
|
||||
- task2
|
||||
|
||||
task2:
|
||||
action: action.doesnt_exist
|
||||
"""
|
||||
exec_db = self._run_workflow(WORKFLOW_WRONG_ACTION)
|
||||
|
||||
# TODO(dzimine): Catch tasks caused error, and set them to ERROR:
|
||||
# TODO(dzimine): self.assertTrue(task_db.state, states.ERROR)
|
||||
|
||||
self.assertTrue(exec_db.state, states.ERROR)
|
||||
self.assertIn("Failed to find action", exec_db.state_info)
|
||||
|
||||
def test_wrong_action_first_task(self):
|
||||
WORKFLOW_WRONG_ACTION_FIRST_TASK = """
|
||||
version: '2.0'
|
||||
|
||||
wf:
|
||||
type: direct
|
||||
tasks:
|
||||
task1:
|
||||
action: wrong.task
|
||||
"""
|
||||
wf_service.create_workflows(WORKFLOW_WRONG_ACTION_FIRST_TASK)
|
||||
with mock.patch.object(de.DefaultEngine, '_fail_workflow') as mock_fw:
|
||||
self.assertRaises(
|
||||
exc.InvalidActionException,
|
||||
self.engine.start_workflow, 'wf', None)
|
||||
|
||||
mock_fw.assert_called_once()
|
||||
self.assertTrue(
|
||||
issubclass(
|
||||
type(mock_fw.call_args[0][1]),
|
||||
exc.InvalidActionException
|
||||
),
|
||||
"Called with a right exception"
|
||||
)
|
||||
|
||||
def test_messed_yaql(self):
|
||||
WORKFLOW_MESSED_YAQL = """
|
||||
version: '2.0'
|
||||
|
||||
wf:
|
||||
type: direct
|
||||
tasks:
|
||||
task1:
|
||||
action: std.echo output="Echo"
|
||||
# publish: {wrong yaql}
|
||||
on-complete:
|
||||
- task2
|
||||
|
||||
task2:
|
||||
action: std.echo output="{wrong yaql}"
|
||||
"""
|
||||
exec_db = self._run_workflow(WORKFLOW_MESSED_YAQL)
|
||||
|
||||
self.assertTrue(exec_db.state, states.ERROR)
|
||||
|
||||
def test_messed_yaql_in_first_task(self):
|
||||
WORKFLOW_MESSED_YAQL_IN_FIRST_TASK = """
|
||||
version: '2.0'
|
||||
|
||||
wf:
|
||||
type: direct
|
||||
tasks:
|
||||
task1:
|
||||
action: std.echo output="{wrong(yaql)}"
|
||||
"""
|
||||
wf_service.create_workflows(WORKFLOW_MESSED_YAQL_IN_FIRST_TASK)
|
||||
|
||||
with mock.patch.object(de.DefaultEngine, '_fail_workflow') as mock_fw:
|
||||
self.assertRaises(
|
||||
yaql_exc.YaqlException, self.engine.start_workflow, 'wf', None)
|
||||
|
||||
mock_fw.assert_called_once()
|
||||
self.assertTrue(
|
||||
issubclass(
|
||||
type(mock_fw.call_args[0][1]),
|
||||
yaql_exc.YaqlException
|
||||
),
|
||||
"Called with a right exception"
|
||||
)
|
||||
|
@ -12,9 +12,12 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import mock
|
||||
from oslo.config import cfg
|
||||
|
||||
from mistral.db.v2 import api as db_api
|
||||
from mistral.engine1 import default_engine as de
|
||||
from mistral import exceptions as exc
|
||||
from mistral.openstack.common import log as logging
|
||||
from mistral.services import scheduler
|
||||
from mistral.services import workbooks as wb_service
|
||||
@ -300,3 +303,28 @@ class WorkflowResumeTest(base.EngineTestCase):
|
||||
|
||||
self.assertEqual(states.SUCCESS, exec_db.state)
|
||||
self.assertEqual(4, len(exec_db.tasks))
|
||||
|
||||
@mock.patch.object(de.DefaultEngine, '_fail_workflow')
|
||||
def test_resume_fails(self, mock_fw):
|
||||
# Start and pause workflow.
|
||||
wb_service.create_workbook_v2(WORKBOOK_DIFFERENT_TASK_STATES)
|
||||
exec_db = self.engine.start_workflow('wb.wf1', {})
|
||||
|
||||
self._await(lambda: self.is_execution_paused(exec_db.id))
|
||||
|
||||
exec_db = db_api.get_execution(exec_db.id)
|
||||
self.assertEqual(states.PAUSED, exec_db.state)
|
||||
|
||||
# Simulate failure and check if it is handled.
|
||||
err = exc.MistralException('foo')
|
||||
with mock.patch.object(
|
||||
de.DefaultEngine,
|
||||
'_run_remote_commands',
|
||||
side_effect=err):
|
||||
|
||||
self.assertRaises(
|
||||
exc.MistralException,
|
||||
self.engine.resume_workflow,
|
||||
exec_db.id
|
||||
)
|
||||
mock_fw.assert_called_once_with(exec_db.id, err)
|
||||
|
@ -109,7 +109,13 @@ class WorkflowHandler(object):
|
||||
if (task_db.state == states.ERROR and
|
||||
not self._is_error_handled(task_db)):
|
||||
if not self.is_paused_or_completed():
|
||||
self._set_execution_state(states.ERROR)
|
||||
# TODO(dzimine): pass task_db.result when Model refactored.
|
||||
msg = str(task_db.output.get('error', "Unknown"))
|
||||
self._set_execution_state(
|
||||
states.ERROR,
|
||||
"Failure caused by error in task '%s': %s"
|
||||
% (task_db.name, msg)
|
||||
)
|
||||
|
||||
return []
|
||||
|
||||
@ -254,12 +260,12 @@ class WorkflowHandler(object):
|
||||
|
||||
return self.exec_db
|
||||
|
||||
def fail_workflow(self):
|
||||
def fail_workflow(self, err_msg=None):
|
||||
"""Stops workflow with ERROR state.
|
||||
|
||||
:return: Execution object.
|
||||
"""
|
||||
self._set_execution_state(states.ERROR)
|
||||
self._set_execution_state(states.ERROR, err_msg)
|
||||
|
||||
return self.exec_db
|
||||
|
||||
@ -286,13 +292,14 @@ class WorkflowHandler(object):
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _set_execution_state(self, state):
|
||||
def _set_execution_state(self, state, state_info=None):
|
||||
cur_state = self.exec_db.state
|
||||
|
||||
if states.is_valid_transition(cur_state, state):
|
||||
WF_TRACE.info("Execution of workflow '%s' [%s -> %s]"
|
||||
% (self.exec_db.wf_name, cur_state, state))
|
||||
self.exec_db.state = state
|
||||
self.exec_db.state_info = state_info
|
||||
else:
|
||||
msg = "Can't change workflow state [execution=%s," \
|
||||
" state=%s -> %s]" % (self.exec_db, cur_state, state)
|
||||
|
Loading…
Reference in New Issue
Block a user