Only attempt deploying nodes that were prepared

When processing a deployment group, the deployment of nodes was
using the same input and a success against the success_criteria
evaluated after preparing nodes. This led to situations where nodes
failed to prepare, but were still assumed ready for deployment (and
consequently failed to deploy).
This was especially problematic when a timeout was triggered by Shipyard
before Drydock had finished preparing.

This change will only attempt to deploy nodes that were positively
identified as prepared by Drydock. When the timeout scenario is reached,
since there will have been no positive confirmation of successful nodes,
the deployment of nodes will not be attempted. This will also prevent
attempting to deploy nodes that have explicitly failed to prepare.

Additionally, added some TODOs around the concept of cancelling tasks in
Drydock when Shipyard stops due to a timeout; however, this kind of
functionality does not yet exist, so the TODOs serve as a placeholder.

Change-Id: I582abcec62407dc2903d8a4477ea891a9397f1fb
This commit is contained in:
Bryan Strassner 2018-08-29 12:49:53 -05:00
parent bb1db91a31
commit be81162168
3 changed files with 34 additions and 12 deletions

View File

@ -231,6 +231,12 @@ class DrydockBaseOperator(UcpBaseOperator):
# Raise Time Out Exception
if task_status == 'running' and i == end_range:
# TODO(bryan-strassner) If Shipyard has timed out waiting for
# this task to complete, and Drydock has provided a means
# to cancel a task, that cancellation should be done here.
# task_failure only exits with an exception, so this is the
# end of processing in the case of a timeout.
self.task_failure(False)
# Exit 'for' loop if the task is in 'complete' or 'terminated'

View File

@ -126,15 +126,17 @@ class DrydockNodesOperator(DrydockBaseOperator):
self.prep_interval,
self.prep_timeout)
def _execute_deployment(self, group):
def _execute_deployment(self, group, successful_prepared_nodes):
"""Execute the deployment of nodes for the group.
:param group: The DeploymentGroup to deploy
:param successful_prepared_nodes: Nodes for this group that are
successfully prepared by the prepare nodes step.
Returns a QueryTaskResult object
"""
LOG.info("Group %s is deploying nodes", group.name)
self.node_filter = gen_node_name_filter(group.actionable_nodes)
self.node_filter = gen_node_name_filter(successful_prepared_nodes)
task_result = self._execute_task('deploy_nodes',
self.dep_interval,
self.dep_timeout)
@ -375,12 +377,24 @@ def _process_deployment_groups(dgm, prepare_func, deploy_func):
# been marked as failed.
continue
# Continue with deployment
dep_qtr = deploy_func(group)
# Mark successes as deployed
for node_name in dep_qtr.successes:
dgm.mark_node_deployed(node_name)
dgm.fail_unsuccessful_nodes(group, dep_qtr.successes)
if prep_qtr.successes:
# Continue with deployment, only for successfully prepared nodes
dep_qtr = deploy_func(group, prep_qtr.successes)
# Mark successes as deployed
for node_name in dep_qtr.successes:
dgm.mark_node_deployed(node_name)
dgm.fail_unsuccessful_nodes(group, dep_qtr.successes)
else:
# TODO(bryan-strassner) Update this message if Drydock provides
# a way to cancel a task, and that method is employed by
# Shipyard upon timeout.
LOG.info("There were no nodes successfully prepared. "
"Deployment will not be attempted for group %s. "
"Success criteria will be immediately checked. "
"If a timeout in the prepare step has occured, it is "
"possible that Drydock is still attempting the prepare "
"task.",
group.name)
dgm.evaluate_group_succ_criteria(group.name, Stage.DEPLOYED)

View File

@ -196,7 +196,7 @@ def _gen_pe_func(mode, stand_alone=False):
object, it needs to be false, so that the right amount of "self"
matches the invocation.
"""
def _func(group):
def _func(group, *args):
qtr = QueryTaskResult('ti', 'tn')
if mode == 'all-success':
qtr.successes.extend(group.actionable_nodes)
@ -205,7 +205,7 @@ def _gen_pe_func(mode, stand_alone=False):
pass
return qtr
def _func_self(self, group):
def _func_self(self, group, *args):
return _func(group)
if stand_alone:
@ -346,7 +346,8 @@ class TestDrydockNodesOperator:
op.join_wait = 0
group = DeploymentGroup(GROUP_DICT, mock.MagicMock())
group.actionable_nodes = ['node1', 'node2', 'node3']
op._execute_deployment(group)
succ_prep_nodes = ['node1', 'node2', 'node3']
op._execute_deployment(group, succ_prep_nodes)
assert op._execute_task.call_count == 1
assert cns.call_count == 1
@ -364,7 +365,8 @@ class TestDrydockNodesOperator:
op.join_wait = 0
group = DeploymentGroup(GROUP_DICT, mock.MagicMock())
group.actionable_nodes = ['node1', 'node2', 'node3']
task_res = op._execute_deployment(group)
succ_prep_nodes = ['node1', 'node2', 'node3']
task_res = op._execute_deployment(group, succ_prep_nodes)
assert op._execute_task.call_count == 1
assert cns.call_count == 1
assert 'node4 failed to join Kubernetes' in caplog.text