Only attempt deploying nodes that were prepared

When processing a deployment group, the deployment of nodes was using the same input and a success against the success_criteria evaluated after preparing nodes. This led to situations where nodes failed to prepare, but were still assumed to be ready for deployment (and thus failed during deployment). This was especially problematic when a timeout was triggered by Shipyard before Drydock had finished preparing. This change will only attempt to deploy nodes that were positively identified as prepared by Drydock. When the timeout scenario is reached, since there will have been no positive confirmation of successful nodes, the deployment of nodes will not be attempted. This will also prevent attempting to deploy nodes that have explicitly failed to prepare. Additionally, added some TODOs around the concept of cancelling tasks in Drydock when Shipyard stops due to a timeout; however, this kind of functionality does not yet exist, so the TODOs serve as a placeholder. Change-Id: I582abcec62407dc2903d8a4477ea891a9397f1fb
2018-08-29 12:49:53 -05:00 · 2018-08-29 12:49:53 -05:00 · be81162168
parent bb1db91a31
commit be81162168
3 changed files with 34 additions and 12 deletions
--- a/src/bin/shipyard_airflow/shipyard_airflow/plugins/drydock_base_operator.py
+++ b/src/bin/shipyard_airflow/shipyard_airflow/plugins/drydock_base_operator.py
@ -231,6 +231,12 @@ class DrydockBaseOperator(UcpBaseOperator):

            # Raise Time Out Exception
            if task_status == 'running' and i == end_range:
+                # TODO(bryan-strassner) If Shipyard has timed out waiting for
+                #     this task to complete, and Drydock has provided a means
+                #     to cancel a task, that cancellation should be done here.
+
+                # task_failure only exits with an exception, so this is the
+                # end of processing in the  case of a timeout.
                self.task_failure(False)

            # Exit 'for' loop if the task is in 'complete' or 'terminated'
--- a/src/bin/shipyard_airflow/shipyard_airflow/plugins/drydock_nodes.py
+++ b/src/bin/shipyard_airflow/shipyard_airflow/plugins/drydock_nodes.py
@ -126,15 +126,17 @@ class DrydockNodesOperator(DrydockBaseOperator):
                                  self.prep_interval,
                                  self.prep_timeout)

-    def _execute_deployment(self, group):
+    def _execute_deployment(self, group, successful_prepared_nodes):
        """Execute the deployment of nodes for the group.

        :param group: The DeploymentGroup to deploy
+        :param successful_prepared_nodes: Nodes for this group that are
+            successfully prepared by the prepare nodes step.
        Returns a QueryTaskResult object
        """
        LOG.info("Group %s is deploying nodes", group.name)

-        self.node_filter = gen_node_name_filter(group.actionable_nodes)
+        self.node_filter = gen_node_name_filter(successful_prepared_nodes)
        task_result = self._execute_task('deploy_nodes',
                                         self.dep_interval,
                                         self.dep_timeout)
@ -375,12 +377,24 @@ def _process_deployment_groups(dgm, prepare_func, deploy_func):
            # been marked as failed.
            continue

-        # Continue with deployment
-        dep_qtr = deploy_func(group)
-        # Mark successes as deployed
-        for node_name in dep_qtr.successes:
-            dgm.mark_node_deployed(node_name)
-        dgm.fail_unsuccessful_nodes(group, dep_qtr.successes)
+        if prep_qtr.successes:
+            # Continue with deployment, only for successfully prepared nodes
+            dep_qtr = deploy_func(group, prep_qtr.successes)
+            # Mark successes as deployed
+            for node_name in dep_qtr.successes:
+                dgm.mark_node_deployed(node_name)
+            dgm.fail_unsuccessful_nodes(group, dep_qtr.successes)
+        else:
+            # TODO(bryan-strassner) Update this message if Drydock provides
+            #     a way to cancel a task, and that method is employed by
+            #     Shipyard upon timeout.
+            LOG.info("There were no nodes successfully prepared. "
+                     "Deployment will not be attempted for group %s. "
+                     "Success criteria will be immediately checked. "
+                     "If a timeout in the prepare step has occured, it is "
+                     "possible that Drydock is still attempting the prepare "
+                     "task.",
+                     group.name)
        dgm.evaluate_group_succ_criteria(group.name, Stage.DEPLOYED)


--- a/src/bin/shipyard_airflow/tests/unit/plugins/test_drydock_nodes_operator.py
+++ b/src/bin/shipyard_airflow/tests/unit/plugins/test_drydock_nodes_operator.py
@ -196,7 +196,7 @@ def _gen_pe_func(mode, stand_alone=False):
        object, it needs to be false, so that the right amount of "self"
        matches the invocation.
    """
-    def _func(group):
+    def _func(group, *args):
        qtr = QueryTaskResult('ti', 'tn')
        if mode == 'all-success':
            qtr.successes.extend(group.actionable_nodes)
@ -205,7 +205,7 @@ def _gen_pe_func(mode, stand_alone=False):
            pass
        return qtr

-    def _func_self(self, group):
+    def _func_self(self, group, *args):
        return _func(group)

    if stand_alone:
@ -346,7 +346,8 @@ class TestDrydockNodesOperator:
        op.join_wait = 0
        group = DeploymentGroup(GROUP_DICT, mock.MagicMock())
        group.actionable_nodes = ['node1', 'node2', 'node3']
-        op._execute_deployment(group)
+        succ_prep_nodes = ['node1', 'node2', 'node3']
+        op._execute_deployment(group, succ_prep_nodes)
        assert op._execute_task.call_count == 1
        assert cns.call_count == 1

@ -364,7 +365,8 @@ class TestDrydockNodesOperator:
        op.join_wait = 0
        group = DeploymentGroup(GROUP_DICT, mock.MagicMock())
        group.actionable_nodes = ['node1', 'node2', 'node3']
-        task_res = op._execute_deployment(group)
+        succ_prep_nodes = ['node1', 'node2', 'node3']
+        task_res = op._execute_deployment(group, succ_prep_nodes)
        assert op._execute_task.call_count == 1
        assert cns.call_count == 1
        assert 'node4 failed to join Kubernetes' in caplog.text